icsDataValidation 1.0.371__py3-none-any.whl → 1.0.415__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. icsDataValidation/configuration.py +0 -0
  2. icsDataValidation/connection_setups/__init__.py +0 -0
  3. icsDataValidation/connection_setups/azure_connection_setup.py +2 -1
  4. icsDataValidation/connection_setups/databricks_connection_setup.py +0 -0
  5. icsDataValidation/connection_setups/exasol_connection_setup.py +0 -0
  6. icsDataValidation/connection_setups/oracle_connection_setup.py +0 -0
  7. icsDataValidation/connection_setups/snowflake_connection_setup.py +0 -0
  8. icsDataValidation/connection_setups/sqlserver_connection_setup.py +20 -0
  9. icsDataValidation/connection_setups/teradata_connection_setup.py +0 -0
  10. icsDataValidation/core/__init__.py +0 -0
  11. icsDataValidation/core/database_objects.py +0 -0
  12. icsDataValidation/core/object_comparison.py +0 -0
  13. icsDataValidation/input_parameters/__init__.py +0 -0
  14. icsDataValidation/input_parameters/testing_tool_params.py +4 -3
  15. icsDataValidation/main.py +15 -11
  16. icsDataValidation/output_parameters/__init__.py +0 -0
  17. icsDataValidation/output_parameters/result_params.py +0 -0
  18. icsDataValidation/services/__init__.py +0 -0
  19. icsDataValidation/services/comparison_service.py +80 -76
  20. icsDataValidation/services/database_services/__init__.py +0 -0
  21. icsDataValidation/services/database_services/azure_service.py +69 -43
  22. icsDataValidation/services/database_services/databricks_hive_metastore_service.py +20 -7
  23. icsDataValidation/services/database_services/databricks_unity_catalog_service.py +20 -12
  24. icsDataValidation/services/database_services/exasol_service.py +26 -23
  25. icsDataValidation/services/database_services/oracle_service.py +64 -55
  26. icsDataValidation/services/database_services/snowflake_service.py +85 -36
  27. icsDataValidation/services/database_services/sqlserver_service.py +868 -0
  28. icsDataValidation/services/database_services/teradata_service.py +54 -37
  29. icsDataValidation/services/initialization_service.py +0 -0
  30. icsDataValidation/services/result_service.py +0 -0
  31. icsDataValidation/services/system_service.py +4 -0
  32. icsDataValidation/services/testset_service.py +0 -0
  33. icsDataValidation/utils/__init__.py +0 -0
  34. icsDataValidation/utils/file_util.py +0 -0
  35. icsDataValidation/utils/logger_util.py +0 -0
  36. icsDataValidation/utils/pandas_util.py +0 -0
  37. icsDataValidation/utils/parallelization_util.py +0 -0
  38. icsDataValidation/utils/sql_util.py +0 -0
  39. icsdatavalidation-1.0.415.dist-info/METADATA +298 -0
  40. {icsDataValidation-1.0.371.dist-info → icsdatavalidation-1.0.415.dist-info}/RECORD +18 -16
  41. {icsDataValidation-1.0.371.dist-info → icsdatavalidation-1.0.415.dist-info}/WHEEL +1 -1
  42. {icsDataValidation-1.0.371.dist-info → icsdatavalidation-1.0.415.dist-info}/top_level.txt +0 -0
  43. icsDataValidation-1.0.371.dist-info/METADATA +0 -21
@@ -110,7 +110,7 @@ class TeradataService(object):
110
110
  try:
111
111
 
112
112
  row_count = self.execute_queries(query_get_row_count).fetchall()[0][0]
113
-
113
+
114
114
  except Exception as err:
115
115
  error_list.append(str(err))
116
116
  error_list.append(query_get_row_count)
@@ -143,16 +143,17 @@ class TeradataService(object):
143
143
 
144
144
  return results
145
145
 
146
- def get_count_distincts_from_object(self, object : DatabaseObject, column_intersections: list, where_clause: str="", exclude_columns:list=[]) -> dict:
146
+ def get_count_distincts_from_object(self, object : DatabaseObject, column_intersections: list, where_clause: str="", exclude_columns:list=[],
147
+ enclose_column_by_double_quotes: bool = False) -> dict:
147
148
 
148
149
  if self.teradata_connection is None:
149
150
  self._connect_to_teradata()
150
-
151
+
151
152
  unions=""
152
153
  for column in column_intersections:
153
154
  if column not in exclude_columns:
154
155
  unions +=f"UNION SELECT CAST('{column}' AS VARCHAR(500)) AS COLUMN_NAME, COUNT(DISTINCT {column}) AS COUNT_DISTINCT FROM {object.schema}.{object.name} {where_clause}"
155
-
156
+
156
157
  query_get_count_distincts_from_object=f"{unions[5:]} ORDER BY 2;"
157
158
  error_list = []
158
159
  dict_count_distincts = []
@@ -167,7 +168,7 @@ class TeradataService(object):
167
168
  }
168
169
 
169
170
  dict_count_distincts.append(single_dict)
170
-
171
+
171
172
  except Exception as err:
172
173
  #raise err
173
174
  error_list.append(["ERROR", str(err).split('|||')[0], str(err).split('|||')[1]])
@@ -182,7 +183,8 @@ class TeradataService(object):
182
183
 
183
184
  return size
184
185
 
185
- def create_checksums(self, object: DatabaseObject, column_intersections: list, where_clause:str="", exclude_columns:list=[]) -> List[Dict]:
186
+ def create_checksums(self, object: DatabaseObject, column_intersections: list, where_clause:str="", exclude_columns:list=[],
187
+ enclose_column_by_double_quotes: bool = False) -> List[Dict]:
186
188
 
187
189
  if self.teradata_connection is None:
188
190
  self._connect_to_teradata()
@@ -212,7 +214,7 @@ class TeradataService(object):
212
214
  elif column_datatype.lower() == 'i1' and 1 == 0:
213
215
  aggregates += f", (SELECT CONCAT ((select trim(count(*)) as val FROM {object.schema}.{object.name} WHERE {column} = 1),'_',(select trim(count(*)) as val from {object.schema}.{object.name} WHERE {column} = 0))) AS aggregateboolean_{column}"
214
216
  #else: Additional Data Types: ++ TD_ANYTYPE, a1 ARRAY, AN ARRAY , bo BINARY LARGE OBJECT, us USER‑DEFINED TYPE (all types),xm XML
215
-
217
+
216
218
  query_checksums = f"select {aggregates[1:]} from {object.schema}.{object.name} {where_clause};"
217
219
 
218
220
  query_countnulls = f"select {count_nulls[1:]} from {object.schema}.{object.name} {where_clause};"
@@ -257,7 +259,19 @@ class TeradataService(object):
257
259
  return checksums
258
260
 
259
261
 
260
- def create_pandas_df_from_group_by(self, object: DatabaseObject, column_intersections: list, group_by_columns: list, group_by_aggregation_columns: list, group_by_aggregation_type: str, only_numeric: bool, where_clause: str, exclude_columns: list, numeric_scale: int=None) -> List[Dict]:
262
+ def create_pandas_df_from_group_by(
263
+ self,
264
+ object: DatabaseObject,
265
+ column_intersections: list,
266
+ group_by_columns: list,
267
+ group_by_aggregation_columns: list,
268
+ group_by_aggregation_type: str,
269
+ only_numeric: bool,
270
+ where_clause: str,
271
+ exclude_columns: list,
272
+ numeric_scale: int=None,
273
+ enclose_column_by_double_quotes: bool = False
274
+ ) -> List[Dict]:
261
275
 
262
276
  if self.teradata_connection is None:
263
277
  self._connect_to_teradata()
@@ -267,7 +281,7 @@ class TeradataService(object):
267
281
  else:
268
282
  aggregation_columns= [f"{column.upper()}" for column in column_intersections if (column in group_by_aggregation_columns and column not in exclude_columns)]
269
283
 
270
- dict_colummns_datatype_grouping=self.get_data_types_from_object(object, group_by_columns)
284
+ dict_colummns_datatype_grouping=self.get_data_types_from_object(object, group_by_columns)
271
285
 
272
286
  group_by_query_columns_string = " "
273
287
  grouping_columns_final = []
@@ -286,7 +300,7 @@ class TeradataService(object):
286
300
 
287
301
  group_by_query_columns_string = group_by_query_columns_string[:-1]
288
302
 
289
- dict_colummns_datatype=self.get_data_types_from_object(object, aggregation_columns)
303
+ dict_colummns_datatype=self.get_data_types_from_object(object, aggregation_columns)
290
304
 
291
305
  aggregates = ""
292
306
  aggregates_min = ""
@@ -294,15 +308,15 @@ class TeradataService(object):
294
308
  for column in aggregation_columns:
295
309
  column_datatype=next(x for x in dict_colummns_datatype if x["COLUMN_NAME"] == column)["DATA_TYPE"]
296
310
  column_datatype = column_datatype.split('(')[0]
297
-
311
+
298
312
  if column_datatype.lower() == 'i8' or column_datatype.lower() == 'i1' or column_datatype.lower() == 'i' or column_datatype.lower() == 'i2':
299
-
313
+
300
314
  if not numeric_scale:
301
315
  aggregates += f", sum(cast ({column} as decimal(30,0))) as sum_{column}"
302
316
  else:
303
317
  aggregates += f", CASE WHEN TRIM(TO_CHAR(CAST(ROUND(sum(cast ({column} as decimal(30,0))), {numeric_scale}) as decimal(38,{numeric_scale})), '999999999999999999.{'0'*numeric_scale}')) like '.%' THEN '0' || TRIM(TO_CHAR(CAST(ROUND(sum(cast ({column} as decimal(30,0))), {numeric_scale}) as decimal(38,{numeric_scale})), '999999999999999999.{'0'*numeric_scale}')) ELSE TRIM(TO_CHAR(CAST(ROUND(sum(cast ({column} as decimal(30,0))), {numeric_scale}) as decimal(38,{numeric_scale})), '999999999999999999.{'0'*numeric_scale}')) END as SUM_{column}"
304
318
  aggregates_min += f", CASE WHEN TRIM(TO_CHAR(CAST(ROUND(min({column}), {numeric_scale}) as decimal(38,{numeric_scale})), '999999999999999999.{'0'*numeric_scale}')) like '.%' THEN '0' || TRIM(TO_CHAR(CAST(ROUND(min({column}), {numeric_scale}) as decimal(38,{numeric_scale})), '999999999999999999.{'0'*numeric_scale}')) ELSE TRIM(TO_CHAR(CAST(ROUND(min({column}), {numeric_scale}) as decimal(38,{numeric_scale})), '999999999999999999.{'0'*numeric_scale}')) END as MIN_{column}, CASE WHEN TRIM(TO_CHAR(CAST(ROUND(max({column}), {numeric_scale}) as decimal(38,{numeric_scale})), '999999999999999999.{'0'*numeric_scale}')) like '.%' THEN '0' || TRIM(TO_CHAR(CAST(ROUND(max({column}), {numeric_scale}) as decimal(38,{numeric_scale})), '999999999999999999.{'0'*numeric_scale}')) ELSE TRIM(TO_CHAR(CAST(ROUND(max({column}), {numeric_scale}) as decimal(38,{numeric_scale})), '999999999999999999.{'0'*numeric_scale}')) END as MAX_{column}"
305
-
319
+
306
320
  elif column_datatype.lower() == 'bf' or column_datatype.lower() == 'bv' or column_datatype.lower() == 'd' or column_datatype.lower() == 'f' or column_datatype.lower() == 'dy' or column_datatype.lower() == 'dh' or column_datatype.lower() == 'dm' or column_datatype.lower() == 'ds' or column_datatype.lower() == 'hr' or column_datatype.lower() == 'hs' or column_datatype.lower() == 'mi' or column_datatype.lower() == 'ms' or column_datatype.lower() == 'mo' or column_datatype.lower() == 'sc' or column_datatype.lower() == 'yr' or column_datatype.lower() == 'ym' or column_datatype.lower() == 'n' or column_datatype.lower() == 'd' :
307
321
  if not numeric_scale:
308
322
  aggregates += f", sum(({column} )) as sum_{column}"
@@ -313,15 +327,15 @@ class TeradataService(object):
313
327
 
314
328
 
315
329
  elif not only_numeric and ( column_datatype.lower() == 'da' or column_datatype.lower() == 'pd' or column_datatype.lower() == 'pt' or column_datatype.lower() == 'pz' or column_datatype.lower() == 'pm' or column_datatype.lower() == 'at' or column_datatype.lower() == 'ts' or column_datatype.lower() == 'tz' or column_datatype.lower() == 'sz'):
316
-
330
+
317
331
  aggregates += f", count(distinct {column}) as COUNTDISTINCT_{column}"
318
332
  aggregates_min += f", min({column}) as MIN_{column}, max({column}) as MAX_{column}"
319
333
 
320
334
  elif not only_numeric and (column_datatype.lower() == 'cv' or column_datatype.lower() == 'cf' or column_datatype.lower() == 'co'):
321
-
335
+
322
336
  aggregates += f", count(distinct {column}) as COUNTDISTINCT_{column}"
323
337
  aggregates_min += f", min(TRIM({column})) as MIN_{column}, max(TRIM({column})) as MAX_{column}"
324
-
338
+
325
339
  elif not only_numeric and column_datatype.lower() == 'i1' and 1 == 0:
326
340
 
327
341
  aggregates += f", (SELECT CONCAT ((select trim(count(*)) as val FROM {object.schema}.{object.name} WHERE {column} = 1),'_',(select trim(count(*)) as val from {object.schema}.{object.name} WHERE {column} = 0))) AS AGGREGATEBOOLEAN_{column}"
@@ -367,7 +381,8 @@ class TeradataService(object):
367
381
  return group_by_aggregation_pdf, group_by_query_aggregation_string, group_by_query_columns_string, grouping_columns_final, error_dict
368
382
 
369
383
 
370
- def create_pandas_df(self, object : DatabaseObject, intersection_columns_trgt_src: list, where_clause:str="", exclude_columns:list=[]) -> pd.DataFrame:
384
+ def create_pandas_df(self, object : DatabaseObject, intersection_columns_trgt_src: list, where_clause:str="", exclude_columns:list=[],
385
+ enclose_column_by_double_quotes: bool = False) -> pd.DataFrame:
371
386
 
372
387
  if self.teradata_connection is None:
373
388
  self._connect_to_teradata()
@@ -379,8 +394,10 @@ class TeradataService(object):
379
394
  src_pdf = self.execute_queries(df_query,True)
380
395
 
381
396
  return src_pdf
382
-
383
- def create_pandas_df_from_sample(self, object: DatabaseObject, column_intersections: list, key_columns: list, where_clause: str="", exclude_columns:list=[], key_filters: dict={}, dedicated_columns: list=[], sample_count :int=10) -> List[Dict]:
397
+
398
+ def create_pandas_df_from_sample(self, object: DatabaseObject, column_intersections: list, key_columns: list, where_clause: str="", exclude_columns:list=[], key_filters: dict={}, dedicated_columns: list=[], sample_count :int=10,
399
+ numeric_scale: int = None,
400
+ enclose_column_by_double_quotes: bool = False) -> List[Dict]:
384
401
 
385
402
  if self.teradata_connection is None:
386
403
  self._connect_to_teradata()
@@ -416,12 +433,12 @@ class TeradataService(object):
416
433
  for column in dedicated_intersection:
417
434
  column_datatype=next(x for x in dict_colummns_datatype if x["COLUMN_NAME"] == column)["DATA_TYPE"]
418
435
  column_datatype = column_datatype.split('(')[0]
419
-
436
+
420
437
  if column_datatype.lower() == 'i8' or column_datatype.lower() == 'i1' or column_datatype.lower() == 'i' or column_datatype.lower() == 'i2':
421
438
  column_intersecions_new.append(f"CASE WHEN TRIM(TO_CHAR(CAST(ROUND({column}, 2) as decimal(38,2)), '999999999999999999999999.00')) like '.%' THEN '0' || TRIM(TO_CHAR(CAST(ROUND({column}, 2) as decimal(38,2)), '999999999999999999999999.00')) ELSE TRIM(TO_CHAR(CAST(ROUND({column}, 2) as decimal(38,2)), '999999999999999999999999.00')) END as {column}")
422
439
  used_columns.append(column)
423
440
  numeric_columns.append(column)
424
-
441
+
425
442
  elif column_datatype.lower() == 'bf' or column_datatype.lower() == 'bv' or column_datatype.lower() == 'd' or column_datatype.lower() == 'f' or column_datatype.lower() == 'dy' or column_datatype.lower() == 'dh' or column_datatype.lower() == 'dm' or column_datatype.lower() == 'ds' or column_datatype.lower() == 'hr' or column_datatype.lower() == 'hs' or column_datatype.lower() == 'mi' or column_datatype.lower() == 'ms' or column_datatype.lower() == 'mo' or column_datatype.lower() == 'sc' or column_datatype.lower() == 'yr' or column_datatype.lower() == 'ym' or column_datatype.lower() == 'n' or column_datatype.lower() == 'd' :
426
443
  column_intersecions_new.append(f"CASE WHEN TRIM(TO_CHAR(CAST(ROUND({column}, 2) as decimal(38,2)), '999999999999999999999999.00')) like '.%' THEN '0' || TRIM(TO_CHAR(CAST(ROUND({column}, 2) as decimal(38,2)), '999999999999999999999999.00')) ELSE TRIM(TO_CHAR(CAST(ROUND({column}, 2) as decimal(38,2)), '999999999999999999999999.00')) END as {column}")
427
444
  used_columns.append(column)
@@ -452,22 +469,22 @@ class TeradataService(object):
452
469
  values = list(key_filters.values())
453
470
  if values[0] == []:
454
471
  sample_query = f"SELECT {columns} FROM {object.schema}.{object.name} SAMPLE {sample_count} {where_clause} ORDER BY {keys};"
455
- else:
472
+ else:
456
473
  where_clause = f'{where_clause} AND (('
457
474
  print(key_filters)
458
475
  for j in range(len(values[0])):
459
476
  for key in key_filters.keys():
460
477
  if key == 'TECH_ID' or key in numeric_columns:
461
- where_clause += f" CAST(ROUND({key}, 2) as decimal(38,2)) = {str(key_filters[key][j])} AND"
462
- else:
463
- where_clause += f" {key} = '{str(key_filters[key][j])}' AND"
478
+ where_clause += f" CAST(ROUND({key}, 2) as decimal(38,2)) = {str(key_filters[key][j])} AND"
479
+ else:
480
+ where_clause += f" {key} = '{str(key_filters[key][j])}' AND"
464
481
  where_clause = f" {where_clause[:-3]}) OR ("
465
482
  where_clause = f"{where_clause[:-4]})"
466
483
 
467
484
  sample_query = f"SELECT {columns} FROM {object.schema}.{object.name} SAMPLE {sample_count} {where_clause} ORDER BY {keys};"
468
485
  else:
469
486
  sample_query = f"SELECT {columns} FROM {object.schema}.{object.name} SAMPLE {sample_count} {where_clause} ORDER BY {keys};"
470
-
487
+
471
488
  elif key_intersection != [] and not is_dedicated:
472
489
  column_intersecions_new = []
473
490
  used_columns = []
@@ -477,13 +494,13 @@ class TeradataService(object):
477
494
  for column in column_intersections:
478
495
  column_datatype=next(x for x in dict_colummns_datatype if x["COLUMN_NAME"] == column)["DATA_TYPE"]
479
496
  column_datatype = column_datatype.split('(')[0]
480
-
497
+
481
498
  if column_datatype.lower() == 'i8' or column_datatype.lower() == 'i1' or column_datatype.lower() == 'i' or column_datatype.lower() == 'i2':
482
499
  #TODO FFR - negativer Fall
483
500
  column_intersecions_new.append(f"CASE WHEN TRIM(TO_CHAR(CAST(ROUND({column}, 2) as decimal(38,2)), '999999999999999999999999.00')) like '.%' THEN '0' || TRIM(TO_CHAR(CAST(ROUND({column}, 2) as decimal(38,2)), '999999999999999999999999.00')) ELSE TRIM(TO_CHAR(CAST(ROUND({column}, 2) as decimal(38,2)), '999999999999999999999999.00')) END as {column}")
484
501
  used_columns.append(column)
485
502
  numeric_columns.append(column)
486
-
503
+
487
504
  elif column_datatype.lower() == 'bf' or column_datatype.lower() == 'bv' or column_datatype.lower() == 'd' or column_datatype.lower() == 'f' or column_datatype.lower() == 'dy' or column_datatype.lower() == 'dh' or column_datatype.lower() == 'dm' or column_datatype.lower() == 'ds' or column_datatype.lower() == 'hr' or column_datatype.lower() == 'hs' or column_datatype.lower() == 'mi' or column_datatype.lower() == 'ms' or column_datatype.lower() == 'mo' or column_datatype.lower() == 'sc' or column_datatype.lower() == 'yr' or column_datatype.lower() == 'ym' or column_datatype.lower() == 'n' or column_datatype.lower() == 'd' :
488
505
  column_intersecions_new.append(f"CASE WHEN TRIM(TO_CHAR(CAST(ROUND({column}, 2) as decimal(38,2)), '999999999999999999999999.00')) like '.%' THEN '0' || TRIM(TO_CHAR(CAST(ROUND({column}, 2) as decimal(38,2)), '999999999999999999999999.00')) ELSE TRIM(TO_CHAR(CAST(ROUND({column}, 2) as decimal(38,2)), '999999999999999999999999.00')) END as {column}")
489
506
  used_columns.append(column)
@@ -504,7 +521,7 @@ class TeradataService(object):
504
521
  columns = columns[:-2]
505
522
  keys = str(key_intersection)[1:-1].replace("'", "")
506
523
 
507
-
524
+
508
525
  if key_filters == {}:
509
526
  sample_query = f"SELECT {columns} FROM {object.schema}.{object.name} SAMPLE {sample_count} {where_clause} ORDER BY {keys};"
510
527
  else:
@@ -525,7 +542,7 @@ class TeradataService(object):
525
542
  # where_clause += " in " + in_clause
526
543
  if values[0] == []:
527
544
  sample_query = f"SELECT {columns} FROM {object.schema}.{object.name} SAMPLE {sample_count} {where_clause} ORDER BY {keys};"
528
- else:
545
+ else:
529
546
  where_clause = f'{where_clause} AND (('
530
547
  print(key_filters)
531
548
  for j in range(len(values[0])):
@@ -533,7 +550,7 @@ class TeradataService(object):
533
550
  if key_filters.keys() in numeric_columns:
534
551
  where_clause += f" {key} = {str(key_filters[key][j])} AND"
535
552
  else:
536
- where_clause += f" {key} = '{str(key_filters[key][j])}' AND"
553
+ where_clause += f" {key} = '{str(key_filters[key][j])}' AND"
537
554
  where_clause += f" {where_clause[:-3]}) OR ("
538
555
  where_clause = f"{where_clause[:-4]})"
539
556
 
@@ -552,17 +569,17 @@ class TeradataService(object):
552
569
  print(dict_colummns_datatype)
553
570
  column_datatype=next(x for x in dict_colummns_datatype if x["COLUMN_NAME"] == column)["DATA_TYPE"]
554
571
  column_datatype = column_datatype.split('(')[0]
555
-
572
+
556
573
  if column_datatype.lower() == 'i8' or column_datatype.lower() == 'i1' or column_datatype.lower() == 'i' or column_datatype.lower() == 'i2':
557
574
  column_intersecions_new.append(f"CASE WHEN TRIM(TO_CHAR(CAST(ROUND({column}, 2) as decimal(38,2)), '999999999999999999999999.00')) like '.%' THEN '0' || TRIM(TO_CHAR(CAST(ROUND({column}, 2) as decimal(38,2)), '999999999999999999999999.00')) ELSE TRIM(TO_CHAR(CAST(ROUND({column}, 2) as decimal(38,2)), '999999999999999999999999.00')) END as {column}")
558
- used_columns.append(column)
575
+ used_columns.append(column)
559
576
  numeric_columns.append(column)
560
-
577
+
561
578
  elif column_datatype.lower() == 'bf' or column_datatype.lower() == 'bv' or column_datatype.lower() == 'd' or column_datatype.lower() == 'f' or column_datatype.lower() == 'dy' or column_datatype.lower() == 'dh' or column_datatype.lower() == 'dm' or column_datatype.lower() == 'ds' or column_datatype.lower() == 'hr' or column_datatype.lower() == 'hs' or column_datatype.lower() == 'mi' or column_datatype.lower() == 'ms' or column_datatype.lower() == 'mo' or column_datatype.lower() == 'sc' or column_datatype.lower() == 'yr' or column_datatype.lower() == 'ym' or column_datatype.lower() == 'n' or column_datatype.lower() == 'd' :
562
579
  column_intersecions_new.append(f"CASE WHEN TRIM(TO_CHAR(CAST(ROUND({column}, 2) as decimal(38,2)), '999999999999999999999999.00')) like '.%' THEN '0' || TRIM(TO_CHAR(CAST(ROUND({column}, 2) as decimal(38,2)), '999999999999999999999999.00')) ELSE TRIM(TO_CHAR(CAST(ROUND({column}, 2) as decimal(38,2)), '999999999999999999999999.00')) END as {column}")
563
- used_columns.append(column)
580
+ used_columns.append(column)
564
581
  numeric_columns.append(column)
565
-
582
+
566
583
  elif column_datatype.lower() == 'cv' or column_datatype.lower() == 'cf' or column_datatype.lower() == 'cf':
567
584
  column_intersecions_new.append(f'TRIM({column}) as decimal(38,2)) AS {column}')
568
585
  used_columns.append(column)
@@ -662,4 +679,4 @@ class TeradataService(object):
662
679
  _ = self.teradata_connection.execute(stripped_statement)
663
680
 
664
681
  except Exception as err:
665
- raise Exception(self._get_error_message(err, single_statement)) from err
682
+ raise Exception(self._get_error_message(err, single_statement)) from err
File without changes
File without changes
@@ -1,6 +1,7 @@
1
1
  from icsDataValidation.connection_setups.snowflake_connection_setup import load_snowflake_credentials
2
2
  from icsDataValidation.connection_setups.exasol_connection_setup import load_exasol_credentials
3
3
  from icsDataValidation.connection_setups.azure_connection_setup import load_azure_credentials
4
+ from icsDataValidation.connection_setups.sqlserver_connection_setup import load_sqlserver_credentials
4
5
  from icsDataValidation.connection_setups.teradata_connection_setup import load_teradata_credentials
5
6
  from icsDataValidation.connection_setups.oracle_connection_setup import load_oracle_credentials
6
7
  from icsDataValidation.connection_setups.databricks_connection_setup import load_databricks_credentials
@@ -8,6 +9,7 @@ from icsDataValidation.services.database_services.snowflake_service import Snowf
8
9
  from icsDataValidation.services.database_services.teradata_service import TeradataService
9
10
  from icsDataValidation.services.database_services.exasol_service import ExasolService
10
11
  from icsDataValidation.services.database_services.azure_service import AzureService
12
+ from icsDataValidation.services.database_services.sqlserver_service import SQLServerService
11
13
  from icsDataValidation.services.database_services.oracle_service import OracleService
12
14
  from icsDataValidation.services.database_services.databricks_hive_metastore_service import DatabricksHiveMetastoreService
13
15
  from icsDataValidation.services.database_services.databricks_unity_catalog_service import DatabricksUnityCatalogService
@@ -33,6 +35,7 @@ class SystemService:
33
35
  "SNOWFLAKE": load_snowflake_credentials,
34
36
  "EXASOL": load_exasol_credentials,
35
37
  "AZURE": load_azure_credentials,
38
+ "SQLSERVER": load_sqlserver_credentials,
36
39
  "TERADATA": load_teradata_credentials,
37
40
  "ORACLE": load_oracle_credentials,
38
41
  "DATABRICKS_HIVE_METASTORE": load_databricks_credentials,
@@ -52,6 +55,7 @@ class SystemService:
52
55
  "SNOWFLAKE": SnowflakeService,
53
56
  "EXASOL": ExasolService,
54
57
  "AZURE": AzureService,
58
+ "SQLSERVER": SQLServerService,
55
59
  "TERADATA": TeradataService,
56
60
  "ORACLE": OracleService,
57
61
  "DATABRICKS_HIVE_METASTORE": DatabricksHiveMetastoreService,
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
@@ -0,0 +1,298 @@
1
+ Metadata-Version: 2.4
2
+ Name: icsDataValidation
3
+ Version: 1.0.415
4
+ Summary: Add your description here
5
+ Author-email: initions <ICSMC_EXT_PYPIORG@accenture.com>
6
+ License: MIT
7
+ Requires-Python: >=3.11
8
+ Description-Content-Type: text/markdown
9
+ Requires-Dist: azure-storage-blob==12.13.1
10
+ Requires-Dist: boto3==1.26.154
11
+ Requires-Dist: cloe-util-snowflake-connector==1.0.5
12
+ Requires-Dist: databricks-sdk==0.29.0
13
+ Requires-Dist: databricks-sql-connector==3.0.1
14
+ Requires-Dist: numpy==1.26.3
15
+ Requires-Dist: oracledb==2.5.0
16
+ Requires-Dist: pandas==2.2.2
17
+ Requires-Dist: pyexasol==0.24.0
18
+ Requires-Dist: pyodbc
19
+ Requires-Dist: python-dotenv>=1.0.1
20
+ Requires-Dist: teradatasql==17.20.0.10
21
+
22
+ # icsDV - initions Data Validation Tool
23
+
24
+ ## Introduction
25
+
26
+ The icsDataValidation tool identifies data mismatches between two databases.
27
+ The functionalities are specifically geared to support migration projects.
28
+ It helps to find data issues in tables and views by comparing a source system with a target system.
29
+
30
+ ### What is "generic" about the tool?
31
+
32
+ The icsDataValidation tool (icsDV) is deliberately structured so that it is easily extensible.
33
+ The main code is used by all different database options.
34
+ Specifics for each supported database are implemented in a database service per database.
35
+
36
+ The different database services are very similar.
37
+ They hold the same methods with the same input and output parameters.
38
+ Each method is aligned with the syntax and the settings of the database it is created for.
39
+ Each core implementation includes connections setup, object comparison functionality and the result preparation.
40
+
41
+ ### Supported Databases
42
+
43
+ The icsDV supports comparisons between the following databases:
44
+
45
+ - Snowflake
46
+ - Teradata
47
+ - Azure SQL Server
48
+ - Exasol
49
+ - Oracle
50
+ - Databricks with and without Unity Catalog
51
+
52
+ Comparison results can be written to either Snowflake or Databricks.
53
+
54
+ ### Features
55
+
56
+ The key features of the tool are:
57
+
58
+ - Comparison of tables and views between a source and a target system.
59
+ - Pipeline integration in Azure DevOps or GitLab
60
+ - Multiple verification/comparison steps:
61
+ - Row count comparison
62
+ - Column names comparison
63
+ - Aggregation comparison (depending on data type)
64
+ - "group by" comparison
65
+ - Pandas DataFrame comparison (with a threshold for the size of the object)
66
+ - Pandas DataFrame sample comparison (with a random sample of the object)
67
+ - Detailed representation of the comparison result
68
+ - "high-level" result (for each pipeline/execution)
69
+ - "object-level" result (for each table/view)
70
+ - "column-level" result (for each column)
71
+ - Parallelization for performance enhancement of the comparison of a large number of objects
72
+ - Input testsets (white-listing of objects)
73
+ - Object filter (black-listing of objects)
74
+ - Object mappings between the source and the target system
75
+ - Comparison result saved and displayed in multiple instances
76
+ - saved as JSON files in the repository
77
+ - export to result tables in the target system (Snowflake or Databricks)
78
+ - export to Azure Blob Storage or AWS S3 Bucket
79
+
80
+ ### Repository Structure
81
+
82
+ The repository is structured in the following sections:
83
+
84
+ - **icsDataValidation**
85
+ > This is where all code files are stored.
86
+
87
+ - **icsDataValidation/main.py**
88
+ > Entry point for python.
89
+
90
+ - **icsDataValidation/core**
91
+ > Main code files for the parts that are independent of the source and target system.
92
+
93
+ - **icsDataValidation/services/database_services**
94
+ > Database services for all supported systems can be found here.
95
+ Each file contains a class that is identically structured in comparison to the other database service classes.
96
+ Each database service class contains methods to query metadata, create aggregations, and retrieve data for the comparison step.
97
+
98
+ - **icsDataValidation/connection_setups**
99
+ > The connection setups are database dependent.
100
+ They define how the credentials for the database connections are retrieved.
101
+
102
+ - **examples/comparison_results**
103
+ > The comparison results are saved here.
104
+ One JSON file with all results is saved for each execution/pipeline run.
105
+ Additionally there are live comparison results saved for each compared object as a failsafe.
106
+
107
+ - **examples**
108
+ > This folder contains all files defining a specific validation setup.
109
+ - A file named `migration_config.json` contains configurations about the source system, the target system and the mapping of objects between both. It contains the blacklists and "group by" aggregation settings.
110
+ - A file named `ics_data_validation_config.json` specifies the source system, the target system and the results system. Most importantly, this includes the names of the results tables and the connection configurations (Server, Port, Secrets) of source and target system.
111
+ - A file named `manual_execution_params.py` is only relevant for local execution of the code. It contains settings which would otherwise be defined in the pipeline setup, i.e. limits on the size of objects to compare and the numeric precision.
112
+ - The folder `testsets` contains JSON files specifying whitelists of objects to compare.
113
+
114
+ For all the files here, empty `*.template.*` files are available and may serve as a starting point.
115
+ This repo stores only template files.
116
+ The actual files used for each setup should not be committed here.
117
+ They are stored in [a separate repository](https://dev.azure.com/initions-consulting/icsDataValidation/_git/icsDataValidation%20-%20workflow%20demo).
118
+
119
+ - **examples/pipeline**
120
+ > Files defining the pipelines that execute the icsDV are stored here. For example, YML files for Azure DevOps pipelines.
121
+
122
+ ## icsDV - Execution Manual
123
+
124
+ ## icsDV - Input Parameters
125
+
126
+ There are four types of input parameters:
127
+
128
+ 1. Pipeline Parameters - defined as input parameters of a pipeline (Azure DevOps Pipeline or GitLab Pipeline).
129
+ 2. Manual Execution Parameters - defined in the code (testing_tool.py).
130
+ They correspond to the Pipeline Parameters and are used in their place when the code is executed directly, without a pipeline.
131
+ 3. Global Parameters - directly defined in the TestingToolParams class. They are used in pipeline runs and for manual executions.
132
+ 4. Environmental Parameters - stored either in an Azure DevOps variable group, in GitLab, or, for manual executions, in a `*.env` file whose location can be specified in `manual_execution_params.py`.
133
+
134
+ Additionally the parameters can be categorized into 3 groups:
135
+
136
+ 1. Setup Parameters - these are parameters which are usually just set once when setting up the icsDV.
137
+ 2. Configuration Parameters - are used to configure the general settings but can be adjusted to the conditions of the workload on the fly.
138
+ 3. Execution Parameters - are set individually for each execution of the icsDV, e.g. the selection of objects to be tested.
139
+
140
+ ### Setup Parameters
141
+
142
+ Stored in `ics_data_validation_config.json`:
143
+
144
+ | Parameter | Description | Input Type |
145
+ |---------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------|
146
+ | source_system_selection | Name of the source system as defined in the database_config.json as a key. | Pipeline Parameter or Manual Execution Parameter |
147
+ | target_system_selection | Name of the target system as defined in the database_config.json as a key. | Pipeline Parameter or Manual Execution Parameter |
148
+ | result_system_selection | Name of the result system as defined in the database_config.json as a key. | Pipeline Parameter or Manual Execution Parameter |
149
+ | azure_devops_pipeline | Azure DevOps Pipeline support. Set to "True" to push the changes of a run to the GIT repository. | Global Parameter - TestingToolParams |
150
+ | gitlab_pipeline | Gitlab Pipeline support. Set to "True" to push the changes of a run to the GIT repository. | Global Parameter - TestingToolParams |
151
+ | result_database_name | Name of the database or catalog the results are written to | Global Parameter - TestingToolParams |
152
+ | result_schema_name | Name of the schema the results are written to | Global Parameter - TestingToolParams |
153
+ | result_table_highlevel_name | Name of the high-level results table | Global Parameter - TestingToolParams |
154
+ | result_table_objectlevel_name | Name of the object-level results table | Global Parameter - TestingToolParams |
155
+ | result_table_columnlevel_name | Name of the column-level results table | Global Parameter - TestingToolParams |
156
+ | result_meta_data_schema_name | Name of the schema the full results are written to | Global Parameter - TestingToolParams |
157
+ | result_table_name | Name of the table the full results are written to | Global Parameter - TestingToolParams |
158
+ | result_live_table_name | Name of the table the live results are written to | Global Parameter - TestingToolParams |
159
+ | results_folder_name | Folder in which the results are stored in JSON format. Default: `examples/comparison_results/` | Global Parameter - TestingToolParams |
160
+ | remaining_mapping_objects_folder_name | Output folder that holds information about source system objects which are not covered by the mapping and are therefore not included in the comparison. Default: `examples/remaining_mapping_objects/` | Global Parameter - TestingToolParams |
161
+ | testset_folder_name | Folder that holds the test set files in JSON format. Default: `examples/testsets/` | Global Parameter - TestingToolParams |
162
+ | stage_schema | Name of the Snowflake Schema where the stage is created to upload the comparison results to Snowflake. Only needed if the `upload_result_to_result_database` functionality is used with Snowflake as target system. | Global Parameter - TestingToolParams |
163
+ | stage_name_prefix | Prefix of the name of the Snowflake Stage which is used to upload the comparison results to Snowflake. The name is complemented by a run_guid which is a unique uuid for each icsDV execution. Only needed if the `upload_result_to_result_database` functionality is used. | Global Parameter - TestingToolParams |
164
+ | container_name | Name of the Azure Storage Container to upload the comparison results into the blob storage. Note: Only needed if the `upload_result_to_blob` functionality is used. | Global Parameter - TestingToolParams |
165
+ | bucket_name | Name of the AWS S3 Bucket to upload the comparison results into the AWS. Note: Only needed if the `upload_result_to_bucket` functionality is used. | Global Parameter - TestingToolParams |
166
+
167
+ ### Configuration Parameters
168
+
169
+ Stored in `manual_execution_params.py`:
170
+
171
+ | Parameter | Description | Input Type |
172
+ |----------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------|
173
+ | ENV_FILEPATH | Absolute path to the `*.env` file containing secrets, passwords and tokens. | Pipeline Parameter or Manual Execution Parameters |
174
+ | UPLOAD_RESULT_TO_BLOB | Set to "True" to upload the comparison results to an Azure Blob Storage. An `azure_storage_connection_string` is needed if set to "True". | Pipeline Parameter or Manual Execution Parameters |
175
+ | UPLOAD_RESULT_TO_BUCKET | Set to "True" to upload the comparison results to an AWS S3 Bucket. An `aws_bucket_access_key` and an `aws_bucket_secret_key` are needed if set to "True". | Pipeline Parameter or Manual Execution Parameter |
176
+ | UPLOAD_RESULT_TO_RESULT_DATABASE | Set to "True" to upload the comparison results to Snowflake or Databricks. A `result_system_selection` is needed if set to "True". | Pipeline Parameter or Manual Execution Parameter |
177
+ | MAX_OBJECT_SIZE | Limits Pandas comparison to objects of a size smaller than `MAX_OBJECT_SIZE` bytes. Data type is String. Default: `str(-1)`, no limit. | Pipeline Parameter or Manual Execution Parameter |
178
+ | MAX_ROW_NUMBER | Limits Pandas comparison to objects with less than `MAX_ROW_NUMBER` rows. Data type is String. Default: `str(-1)`, no limit. | Pipeline Parameter or Manual Execution Parameter |
179
+ | EXECUTE_GROUP_BY_COMPARISON | Set to "True" to execute group-by comparisons. See sec. "Group-By-Aggregation" for details. | Pipeline Parameter or Manual Execution Parameter |
180
+ | USE_GROUP_BY_COLUMNS | Set to "True" to activate group-by columns. See sec. "Group-By-Aggregation" for details. | Pipeline Parameter or Manual Execution Parameter |
181
+ | MIN_GROUP_BY_COUNT_DISTINCT | Minimum expected number of group-by counts. See sec. "Group-By-Aggregation" for details. | Pipeline Parameter or Manual Execution Parameter |
182
+ | MAX_GROUP_BY_COUNT_DISTINCT | Maximum expected number of group-by counts. See sec. "Group-By-Aggregation" for details. | Pipeline Parameter or Manual Execution Parameter |
183
+ | MAX_GROUP_BY_SIZE | Maximum size of the group-by query. See sec. "Group-By-Aggregation" for details. | Pipeline Parameter or Manual Execution Parameter |
184
+ | NUMERIC_SCALE | Number of digits to compare. Data type is String. Default: `str(2)`, i.e. deviations below 0.01 are tolerated. | Pipeline Parameter or Manual Execution Parameter |
185
+
186
+ ### Execution Parameters
187
+
188
+ Stored in `manual_execution_params.py`:
189
+
190
+ | Parameter | Description | Input Type |
191
+ |-------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------|
192
+ | DATABASE_NAME | Filters the test set on a specific database/catalog. For no filter set "None" as a Manual Execution Parameter and leave it empty as a Pipeline Parameter. | Pipeline Parameter or Manual Execution Parameter |
193
+ | SCHEMA_NAME | Filters the test set on a specific schema. For no filter set "None" as a Manual Execution Parameter and leave it empty as a Pipeline Parameter. | Pipeline Parameter or Manual Execution Parameter |
194
+ | TESTSET_FILE_NAMES | File names of the test set as defined in the folder testset_folder_name (see Setup Parameters) as JSON files. | Pipeline Parameter or Manual Execution Parameter |
195
+ | OBJECT_TYPE_RESTRICTION | Filters the testset to only tables (`"include_only_tables"`), only views (`"include_only_views"`) or all tables and views (`"include_all"`). | Pipeline Parameter or Manual Execution Parameter |
196
+ | MAX_NUMBER_OF_THREADS | Maximum number of threads used. Values larger than the default, `str(1)`, activate parallelization. | Pipeline Parameter or Manual Execution Parameter |
197
+
198
+ ## icsDV - Configuration
199
+
200
+ ### Blacklists
201
+
202
+ ### Whitelists (Testsets)
203
+
204
+ ### Mapping
205
+
206
+ ### Group-By-Aggregation
207
+
208
+ The Group-By-Aggregation is a feature to pinpoint the differences in the data.
209
+ It can be activated by setting the parameter `EXECUTE_GROUP_BY_COMPARISON` to TRUE.
210
+ If activated an additional comparison step is performed.
211
+ Each table is queried with a group-by-statement including aggregations depending on the data type.
212
+ Those aggregations are subsequently compared.
213
+ As a result the differences in the data can be narrowed down to certain grouping values.
214
+
215
+ There are three options to define the column over which the group-by is executed.
216
+
217
+ 1. "group-by-columns-per-table" defined as multiple lists for specific tables. Activated with the `USE_GROUP_BY_COLUMNS` parameter and `GROUP_BY_COLUMNS_PER_TABLE` defined in the `migration_config.json`.
218
+ 2. "group-by-columns" from a predefined list for all tables by a validation. Activated with the `USE_GROUP_BY_COLUMNS` parameter and `GROUP_BY_COLUMNS` defined in the `migration_config.json`.
219
+ 3. "group-by-columns" evaluated from all existing columns by a validation
220
+
221
+ The validation consists of a number of tests and can be configured by a number of parameters, either to easily find columns to group by or to select only columns which add a definite value for pinpointing the differences in the data.
222
+
223
+ The validation tests for the "group-by-columns" are:
224
+
225
+ 1. Number of distinct values of the column is more than 1.
226
+ 2. Number of distinct values of the column is less than the rowcount of the table.
227
+ 3. Number of distinct values of the column exceeds the `MIN_GROUP_BY_COUNT_DISTINCT` parameter.
228
+ 4. Number of distinct values of the column is below the `MAX_GROUP_BY_COUNT_DISTINCT` parameter.
229
+ 5. The size of the expected result of the group-by-query is below the `MAX_GROUP_BY_SIZE` parameter.
230
+ (The size is defined by "Number of distinct values" * "Number of columns")
231
+
232
+ All tests are executed on source and target.
233
+
234
+ > Note: The group by comparison can be activated by setting the `execute_group_by_comparison` parameter to TRUE.
235
+ The `migration_config.json` has to include the following keys when the parameter use_group_by_columns is set to TRUE.
236
+
237
+ "GROUP_BY_AGGREGATION":{
238
+ "GROUP_BY_COLUMNS_PER_TABLE": {},
239
+ "GROUP_BY_COLUMNS":[]
240
+ }
241
+ The values of those keys can be empty.
242
+
243
+ ## icsDV - Comparison Results
244
+
245
+ ### JSON Results
246
+
247
+ - Complete Comparison Result JSONs
248
+ - Live Comparison Result JSONs
249
+
250
+ ### Target System Result Tables
251
+
252
+ - High-Level Result
253
+ - Object-Level Result
254
+ - Column-Level Result
255
+
256
+ ### Result Export in a File Storage
257
+
258
+ ## icsDV - Setup
259
+
260
+ ### Code setup
261
+
262
+ - To handle the code, we recommend using VS Code.
263
+ - The code is written in Python. The tool is compatible with version 3.11.
264
+ - It is recommended to use a project-specific python environment.
265
+ You can create one with `python -m venv .env` in the root folder of this repo.
266
+ After creating it, you should activate it (`source .env/bin/activate`), select the python binary `.env/bin/python` therein as your python interpreter in VSC and make sure that python libraries are read from and installed to this environment, i.e. `export PYTHONPATH=$(pwd)/.env/lib/python3.11/site-packages`.
267
+ - In this environment, install the packages listed in the `requirements.txt` and the `requirements-dev.txt`, i.e. run `pip install -r requirements.txt`.
268
+
269
+ ### Setup for manual execution
270
+
271
+ ### Setup as Azure DevOps pipeline
272
+
273
+ ### Setup as GitLab pipeline
274
+
275
+ ## authentication
276
+
277
+ The following authentication methods for Snowflake are supported:
278
+
279
+ - password, provided via PASSWORD_NAME
280
+ - private key with/without encryption, provided via PRIVATE_KEY_NAME with/without PRIVATE_KEY_PASSPHRASE_NAME
281
+ - path to private key file with/without encryption, provided via PRIVATE_KEY_FILE_PATH with/without PRIVATE_KEY_FILE_PASSWORD
282
+
283
+ ## devcontainer
284
+
285
+ Run with uv as follows in the devcontainer:
286
+ ```bash
287
+ uv run -s icsDataValidation/main.py
288
+ ```
289
+
290
+ Inside the [devcontainer config](.devcontainer/devcontainer.json) the mounts setting is used to bring a .env from the host system into the devcontainer.
291
+
292
+ ```json
293
+ "mounts": [
294
+ "source=/home/Documents/Generic_Testing_Tool/generic_testing_tool_password.env,target=/workspaces/icsDataValidation/examples/generic_testing_tool_password.env,type=bind"
295
+ ]
296
+ ```
297
+
298
+ To use this feature either create the .env under the source path on your host or adjust this path to another path on the host system. The target path does not need adjustment!