icsDataValidation 1.0.371__py3-none-any.whl → 1.0.415__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. icsDataValidation/configuration.py +0 -0
  2. icsDataValidation/connection_setups/__init__.py +0 -0
  3. icsDataValidation/connection_setups/azure_connection_setup.py +2 -1
  4. icsDataValidation/connection_setups/databricks_connection_setup.py +0 -0
  5. icsDataValidation/connection_setups/exasol_connection_setup.py +0 -0
  6. icsDataValidation/connection_setups/oracle_connection_setup.py +0 -0
  7. icsDataValidation/connection_setups/snowflake_connection_setup.py +0 -0
  8. icsDataValidation/connection_setups/sqlserver_connection_setup.py +20 -0
  9. icsDataValidation/connection_setups/teradata_connection_setup.py +0 -0
  10. icsDataValidation/core/__init__.py +0 -0
  11. icsDataValidation/core/database_objects.py +0 -0
  12. icsDataValidation/core/object_comparison.py +0 -0
  13. icsDataValidation/input_parameters/__init__.py +0 -0
  14. icsDataValidation/input_parameters/testing_tool_params.py +4 -3
  15. icsDataValidation/main.py +15 -11
  16. icsDataValidation/output_parameters/__init__.py +0 -0
  17. icsDataValidation/output_parameters/result_params.py +0 -0
  18. icsDataValidation/services/__init__.py +0 -0
  19. icsDataValidation/services/comparison_service.py +80 -76
  20. icsDataValidation/services/database_services/__init__.py +0 -0
  21. icsDataValidation/services/database_services/azure_service.py +69 -43
  22. icsDataValidation/services/database_services/databricks_hive_metastore_service.py +20 -7
  23. icsDataValidation/services/database_services/databricks_unity_catalog_service.py +20 -12
  24. icsDataValidation/services/database_services/exasol_service.py +26 -23
  25. icsDataValidation/services/database_services/oracle_service.py +64 -55
  26. icsDataValidation/services/database_services/snowflake_service.py +85 -36
  27. icsDataValidation/services/database_services/sqlserver_service.py +868 -0
  28. icsDataValidation/services/database_services/teradata_service.py +54 -37
  29. icsDataValidation/services/initialization_service.py +0 -0
  30. icsDataValidation/services/result_service.py +0 -0
  31. icsDataValidation/services/system_service.py +4 -0
  32. icsDataValidation/services/testset_service.py +0 -0
  33. icsDataValidation/utils/__init__.py +0 -0
  34. icsDataValidation/utils/file_util.py +0 -0
  35. icsDataValidation/utils/logger_util.py +0 -0
  36. icsDataValidation/utils/pandas_util.py +0 -0
  37. icsDataValidation/utils/parallelization_util.py +0 -0
  38. icsDataValidation/utils/sql_util.py +0 -0
  39. icsdatavalidation-1.0.415.dist-info/METADATA +298 -0
  40. {icsDataValidation-1.0.371.dist-info → icsdatavalidation-1.0.415.dist-info}/RECORD +18 -16
  41. {icsDataValidation-1.0.371.dist-info → icsdatavalidation-1.0.415.dist-info}/WHEEL +1 -1
  42. {icsDataValidation-1.0.371.dist-info → icsdatavalidation-1.0.415.dist-info}/top_level.txt +0 -0
  43. icsDataValidation-1.0.371.dist-info/METADATA +0 -21
@@ -1,8 +1,9 @@
1
+ import snowflake.connector
1
2
  import logging
3
+ import pandas as pd
4
+
2
5
  from pathlib import PurePath
3
6
 
4
- import pandas as pd
5
- import snowflake.connector
6
7
  from cloe_util_snowflake_connector import connection_parameters
7
8
 
8
9
  from icsDataValidation.core.database_objects import DatabaseObject
@@ -61,7 +62,7 @@ class SnowflakeService:
61
62
  return f"Snowflake ERROR: {message}\nFailed statement:\n{statement}"
62
63
 
63
64
  @staticmethod
64
- def _get_in_clause(key_filters: list, numeric_columns: list, numeric_scale: int) -> str:
65
+ def _get_in_clause(key_filters: list, numeric_columns: list, numeric_scale: int, enclose_column_by_double_quotes: bool = False) -> str:
65
66
  """generates in_clause from list ready to expand the where clause, numeric values are rounded
66
67
 
67
68
  Args:
@@ -82,15 +83,18 @@ class SnowflakeService:
82
83
 
83
84
  in_clause_cols = " AND (("
84
85
  for key in key_filters.keys():
86
+ column_identifier = key.replace("'", "")
87
+ if enclose_column_by_double_quotes:
88
+ column_identifier = f'"{column_identifier}"'
85
89
  if key in numeric_columns:
86
- in_clause_cols += f"""ROUND({key.replace("'", "")},2)""" + ","
90
+ in_clause_cols += f"""ROUND({column_identifier}, {numeric_scale}),"""
87
91
  else:
88
- in_clause_cols += key.replace("'", "") + ","
92
+ in_clause_cols += f"{column_identifier},"
89
93
  in_clause_cols = in_clause_cols[:-1] + ")"
90
94
  in_clause = in_clause_cols + " in (" + in_clause_values + ")"
91
95
  return in_clause
92
96
 
93
- def _get_column_clause(self, column_list: list, columns_datatype: list, numeric_scale, key_columns) -> dict:
97
+ def _get_column_clause(self, column_list: list, columns_datatype: list, numeric_scale, key_columns, enclose_column_by_double_quotes: bool = False) -> dict:
94
98
  """
95
99
  Turns list of desired columns into a sql compatible string.
96
100
  Columns with a date or time data type are omitted.
@@ -108,20 +112,26 @@ class SnowflakeService:
108
112
  used_columns = []
109
113
  numeric_columns = []
110
114
  for column in column_list:
115
+
111
116
  column_datatype = next(x for x in columns_datatype if x["COLUMN_NAME"] == column)["DATA_TYPE"]
112
117
 
118
+ if enclose_column_by_double_quotes:
119
+ column_identifier = f'"{column}"'
120
+ else:
121
+ column_identifier = column
122
+
113
123
  if column in key_columns or column_datatype.lower() not in self.snowflake_datatype_mapping["date_and_time"]:
114
124
  if column_datatype.lower() in self.snowflake_datatype_mapping["numeric"]:
115
125
  if numeric_scale:
116
126
  column_intersecions_new.append(
117
- f"CAST(ROUND({column}, {numeric_scale}) as decimal(38,{numeric_scale})) as {column}"
127
+ f'CAST(ROUND({column_identifier}, {numeric_scale}) as decimal(38,{numeric_scale})) as {column_identifier}'
118
128
  )
119
129
  else:
120
- column_intersecions_new.append(f"{column} as {column}")
130
+ column_intersecions_new.append(f"{column_identifier} as {column_identifier}")
121
131
  used_columns.append(column)
122
132
  numeric_columns.append(column)
123
133
  elif column_datatype.lower() in self.snowflake_datatype_mapping["string"]:
124
- column_intersecions_new.append(f"{column} AS {column}")
134
+ column_intersecions_new.append(f"{column_identifier} AS {column_identifier}")
125
135
  used_columns.append(column)
126
136
  else:
127
137
  column_intersecions_new.append(column)
@@ -284,7 +294,12 @@ class SnowflakeService:
284
294
  return dict_colummns_datatype
285
295
 
286
296
  def get_count_distincts_from_object(
287
- self, object: DatabaseObject, column_intersections: list, where_clause: str = "", exclude_columns: list = []
297
+ self,
298
+ object: DatabaseObject,
299
+ column_intersections: list,
300
+ where_clause: str = "",
301
+ exclude_columns: list = [],
302
+ enclose_column_by_double_quotes: bool = False
288
303
  ) -> dict:
289
304
  """get distinct count for every column in a database object that is in column intersections list
290
305
 
@@ -305,8 +320,12 @@ class SnowflakeService:
305
320
  unions = ""
306
321
 
307
322
  for column in column_intersections:
323
+ if enclose_column_by_double_quotes:
324
+ column_identifier = f'"{column}"'
325
+ else:
326
+ column_identifier = column
308
327
  if column not in exclude_columns:
309
- unions += f" UNION SELECT '{column}' AS COLUMN_NAME, COUNT(DISTINCT {column}) AS COUNT_DISTINCT FROM {object.database}.{object.schema}.{object.name} {where_clause}"
328
+ unions += f' UNION SELECT {column_identifier} AS COLUMN_NAME, COUNT(DISTINCT {column_identifier}) AS COUNT_DISTINCT FROM {object.database}.{object.schema}.{object.name} {where_clause}'
310
329
 
311
330
  query_get_count_distincts_from_object = f"{unions[6:]} ORDER BY COUNT_DISTINCT;"
312
331
  error_list = []
@@ -346,6 +365,7 @@ class SnowflakeService:
346
365
  where_clause: str = "",
347
366
  exclude_columns: list = [],
348
367
  numeric_scale: int = None,
368
+ enclose_column_by_double_quotes: bool = False
349
369
  ) -> list[dict]:
350
370
  """creates checksums for given object in compliance with given conditions
351
371
 
@@ -371,29 +391,34 @@ class SnowflakeService:
371
391
  count_nulls = ""
372
392
 
373
393
  for column in column_intersections:
394
+ if enclose_column_by_double_quotes:
395
+ column_identifier = f'"{column}"'
396
+ else:
397
+ column_identifier = column
374
398
  column_datatype = next(x for x in dict_colummns_datatype if x["COLUMN_NAME"] == column)["DATA_TYPE"]
375
399
 
376
- count_nulls += f", SUM(CASE WHEN {column} IS NULL THEN 1 ELSE 0 END) AS COUNTNULLS_{column}"
400
+ count_nulls += f', SUM(CASE WHEN {column_identifier} IS NULL THEN 1 ELSE 0 END) AS "COUNTNULLS_{column}"'
377
401
 
378
402
  if column_datatype.lower() in self.snowflake_datatype_mapping["numeric"]:
379
403
  if numeric_scale:
380
404
  aggregates += (
381
- f", CAST(ROUND(SUM({column}), {numeric_scale}) AS DECIMAL(38, {numeric_scale})) AS sum_{column}"
405
+ f', CAST(ROUND(SUM({column_identifier}), {numeric_scale}) AS DECIMAL(38, {numeric_scale})) AS "SUM_{column}"'
382
406
  )
383
407
  else:
384
- aggregates += f", CAST(SUM({column}) AS DECIMAL(38)) AS sum_{column}"
408
+ aggregates += f', CAST(SUM({column_identifier}) AS DECIMAL(38)) AS "SUM_{column}"'
385
409
 
386
410
  elif (
387
411
  column_datatype.lower() in self.snowflake_datatype_mapping["string"]
388
412
  or column_datatype.lower() in self.snowflake_datatype_mapping["date_and_time"]
389
413
  ):
390
- aggregates += f", COUNT(DISTINCT LOWER({column})) AS countdistinct_{column}"
414
+ aggregates += f', COUNT(DISTINCT LOWER({column_identifier})) AS "COUNTDISTINCT_{column}"'
391
415
 
392
416
  elif column_datatype.lower() in self.snowflake_datatype_mapping["binary"]:
393
- aggregates += f", COUNT(DISTINCT LOWER(TRY_TO_NUMBER({column}::VARCHAR))) AS countdistinct_{column}"
417
+ aggregates += f', COUNT(DISTINCT LOWER(TRY_TO_NUMBER({column_identifier}::VARCHAR))) AS "COUNTDISTINCT_{column}"'
394
418
 
395
419
  elif column_datatype.lower() in self.snowflake_datatype_mapping["boolean"]:
396
- aggregates += f", MAX(SELECT COUNT(*) FROM {object.database}.{object.schema}.{object.name} WHERE {column} = true)::VARCHAR || '_' || MAX(SELECT COUNT(*) FROM {object.database}.{object.schema}.{object.name} WHERE {column} = false) :: VARCHAR AS aggregateboolean_{column}"
420
+ aggregates += f", MAX(SELECT COUNT(*) FROM {object.database}.{object.schema}.{object.name} WHERE {column_identifier} = true)::VARCHAR || '_' || MAX(SELECT COUNT(*) FROM {object.database}.{object.schema}.{object.name} WHERE {column_identifier} = false) :: VARCHAR AS \"AGGREGATEBOOLEAN_{column}\""
421
+
397
422
 
398
423
  # else: Additional Data Types: VARIANT OBJECT ARRAY GEOGRAPHY
399
424
 
@@ -450,6 +475,7 @@ class SnowflakeService:
450
475
  where_clause: str,
451
476
  exclude_columns: list,
452
477
  numeric_scale: int = None,
478
+ enclose_column_by_double_quotes: bool = False
453
479
  ) -> list[dict]:
454
480
  """execution of multiple aggregations at once
455
481
 
@@ -490,8 +516,12 @@ class SnowflakeService:
490
516
 
491
517
  try:
492
518
  for column in group_by_columns:
519
+ if enclose_column_by_double_quotes:
520
+ column_identifier = f'"{column}"'
521
+ else:
522
+ column_identifier = column
493
523
  if column in column_intersections and column not in exclude_columns:
494
- group_by_query_columns_string += f"{column} ,"
524
+ group_by_query_columns_string += f"{column_identifier} ,"
495
525
  grouping_columns_final.append(column)
496
526
 
497
527
  group_by_query_columns_string = group_by_query_columns_string[:-1]
@@ -502,27 +532,31 @@ class SnowflakeService:
502
532
  aggregates_min = ""
503
533
 
504
534
  for column in aggregation_columns:
535
+ if enclose_column_by_double_quotes:
536
+ column_identifier = f'"{column}"'
537
+ else:
538
+ column_identifier = column
505
539
  column_datatype = next(x for x in dict_colummns_datatype if x["COLUMN_NAME"] == column)["DATA_TYPE"]
506
540
 
507
541
  if column_datatype.lower() in self.snowflake_datatype_mapping["numeric"]:
508
542
  if numeric_scale:
509
- aggregates_min += f", CAST(ROUND(MIN({column}),{numeric_scale}) AS DECIMAL(38,{numeric_scale})) AS MIN_{column}, CAST(ROUND(max({column}),{numeric_scale}) AS DECIMAL(38,{numeric_scale})) AS MAX_{column}"
510
- aggregates += f", CAST(ROUND(SUM({column}), {numeric_scale}) AS DECIMAL(38,{numeric_scale})) AS SUM_{column}"
543
+ aggregates_min += f', CAST(ROUND(MIN({column_identifier}),{numeric_scale}) AS DECIMAL(38,{numeric_scale})) AS "MIN_{column}", CAST(ROUND(max({column_identifier}),{numeric_scale}) AS DECIMAL(38,{numeric_scale})) AS "MAX_{column}"'
544
+ aggregates += f', CAST(ROUND(SUM({column_identifier}), {numeric_scale}) AS DECIMAL(38,{numeric_scale})) AS "SUM_{column}"'
511
545
  else:
512
- aggregates_min += f", MIN({column}) AS MIN_{column}, MAX({column}) AS MAX_{column}"
513
- aggregates += f", SUM({column}) AS SUM_{column}"
546
+ aggregates_min += f', MIN({column_identifier}) AS "MIN_{column}", MAX({column_identifier}) AS "MAX_{column}"'
547
+ aggregates += f', SUM({column_identifier}) AS "SUM_{column}"'
514
548
 
515
549
  elif not only_numeric and (
516
550
  column_datatype.lower() in self.snowflake_datatype_mapping["string"]
517
551
  or column_datatype.lower() in self.snowflake_datatype_mapping["date_and_time"]
518
552
  ):
519
- aggregates += f", COUNT(DISTINCT LOWER({column})) AS COUNTDISTINCT_{column}"
553
+ aggregates += f', COUNT(DISTINCT LOWER({column_identifier})) AS "COUNTDISTINCT_{column}"'
520
554
 
521
555
  elif not only_numeric and column_datatype.lower() in self.snowflake_datatype_mapping["binary"]:
522
- aggregates += f", COUNT(DISTINCT LOWER(TRY_TO_NUMBER({column}::VARCHAR))) AS COUNTDISTINCT_{column}"
556
+ aggregates += f', COUNT(DISTINCT LOWER(TRY_TO_NUMBER({column_identifier}::VARCHAR))) AS "COUNTDISTINCT_{column}"'
523
557
 
524
558
  elif not only_numeric and column_datatype.lower() in self.snowflake_datatype_mapping["boolean"]:
525
- aggregates += f", MAX(SELECT COUNT(*) FROM {object.database}.{object.schema}.{object.name} WHERE {column} = true)::VARCHAR || '_' || MAX(SELECT COUNT(*) FROM {object.database}.{object.schema}.{object.name} WHERE {column} = false) :: VARCHAR AS AGGREGATEBOOLEAN_{column}"
559
+ aggregates += f", MAX(SELECT COUNT(*) FROM {object.database}.{object.schema}.{object.name} WHERE {column_identifier} = true)::VARCHAR || '_' || MAX(SELECT COUNT(*) FROM {object.database}.{object.schema}.{object.name} WHERE {column_identifier} = false) :: VARCHAR AS \"AGGREGATEBOOLEAN_{column}\""
526
560
 
527
561
  # else: Additional Data Types: VARIANT OBJECT ARRAY GEOGRAPHY
528
562
 
@@ -573,6 +607,7 @@ class SnowflakeService:
573
607
  intersection_columns_trgt_src: list,
574
608
  where_clause: str = "",
575
609
  exclude_columns: list = [],
610
+ enclose_column_by_double_quotes: bool = False
576
611
  ) -> pd.DataFrame:
577
612
  """creates pandas dataframes with all data from given object in given columns
578
613
 
@@ -586,14 +621,17 @@ class SnowflakeService:
586
621
 
587
622
  if self.snowflake_connection is None:
588
623
  self._connect_to_snowflake()
589
-
590
- intersection_columns_trgt_src_ = ", ".join(list(set(intersection_columns_trgt_src) - set(exclude_columns)))
624
+ if enclose_column_by_double_quotes:
625
+ intersection_columns_trgt_src_ = '", "'.join(list(set(intersection_columns_trgt_src) - set(exclude_columns)))
626
+ intersection_columns_trgt_src_ = f'"{intersection_columns_trgt_src_}"'
627
+ else:
628
+ intersection_columns_trgt_src_ = ", ".join(list(set(intersection_columns_trgt_src) - set(exclude_columns)))
591
629
 
592
630
  df_query = f"SELECT {intersection_columns_trgt_src_} FROM {object.database}.{object.schema}.{object.name} {where_clause};"
593
631
 
594
- src_pdf = self.execute_queries(df_query, True)
632
+ pdf = self.execute_queries(df_query, True)
595
633
 
596
- return src_pdf
634
+ return pdf
597
635
 
598
636
  def create_pandas_df_from_sample(
599
637
  self,
@@ -606,6 +644,7 @@ class SnowflakeService:
606
644
  dedicated_columns: list = [],
607
645
  sample_count: int = 10,
608
646
  numeric_scale: int = None,
647
+ enclose_column_by_double_quotes: bool = False
609
648
  ) -> list[dict]:
610
649
  if self.snowflake_connection is None:
611
650
  self._connect_to_snowflake()
@@ -633,28 +672,37 @@ class SnowflakeService:
633
672
  dict_colummns_datatype = self.get_data_types_from_object(object, column_intersections)
634
673
 
635
674
  if key_intersection != [] and is_dedicated:
636
- keys = str(key_intersection)[1:-1].replace("'", "")
675
+ if enclose_column_by_double_quotes:
676
+ keys = str(key_intersection)[1:-1].replace("'", "\"")
677
+ else:
678
+ keys = str(key_intersection)[1:-1].replace("'", "")
679
+
637
680
  column_clause, numeric_columns, used_columns = self._get_column_clause(
638
- dedicated_intersection, dict_colummns_datatype, numeric_scale, key_columns
681
+ dedicated_intersection, dict_colummns_datatype, numeric_scale, key_columns,
682
+ enclose_column_by_double_quotes
639
683
  )
640
684
  if (key_filters != {}) & (filter_intersection != []):
641
685
  values = list(key_filters.values())
642
686
  if values[0] != []:
643
- in_clause = self._get_in_clause(key_filters, numeric_columns, numeric_scale)
687
+ in_clause = self._get_in_clause(key_filters, numeric_columns, numeric_scale, enclose_column_by_double_quotes)
644
688
  else:
645
689
  in_clause = ""
646
690
  else:
647
691
  in_clause = ""
648
692
  sample_query = f"SELECT {column_clause} FROM {object.database}.{object.schema}.{object.name} SAMPLE ({sample_count} ROWS) {where_clause}{in_clause} ORDER BY {keys};"
649
693
  elif key_intersection != [] and not is_dedicated:
650
- keys = str(key_intersection)[1:-1].replace("'", "")
694
+ if enclose_column_by_double_quotes:
695
+ keys = str(key_intersection)[1:-1].replace("'", "\"")
696
+ else:
697
+ keys = str(key_intersection)[1:-1].replace("'", "")
651
698
  column_clause, numeric_columns, used_columns = self._get_column_clause(
652
- column_intersections, dict_colummns_datatype, numeric_scale, key_columns
699
+ column_intersections, dict_colummns_datatype, numeric_scale, key_columns,
700
+ enclose_column_by_double_quotes
653
701
  )
654
702
  if (key_filters != {}) & (filter_intersection != []):
655
703
  values = list(key_filters.values())
656
704
  if values[0] != []:
657
- in_clause = self._get_in_clause(key_filters, numeric_columns, numeric_scale)
705
+ in_clause = self._get_in_clause(key_filters, numeric_columns, numeric_scale, enclose_column_by_double_quotes)
658
706
  else:
659
707
  in_clause = ""
660
708
  else:
@@ -664,7 +712,8 @@ class SnowflakeService:
664
712
  column_intersections = list(set(column_intersections) - set(exclude_columns))
665
713
  column_intersections.sort()
666
714
  column_clause, numeric_columns, used_columns = self._get_column_clause(
667
- column_intersections, dict_colummns_datatype, numeric_scale, key_columns
715
+ column_intersections, dict_colummns_datatype, numeric_scale, key_columns,
716
+ enclose_column_by_double_quotes
668
717
  )
669
718
  sample_query = f"SELECT {column_clause} FROM {object.database}.{object.schema}.{object.name} SAMPLE ({sample_count} ROWS) {where_clause};"
670
719