icsDataValidation 1.0.371__py3-none-any.whl → 1.0.415__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- icsDataValidation/configuration.py +0 -0
- icsDataValidation/connection_setups/__init__.py +0 -0
- icsDataValidation/connection_setups/azure_connection_setup.py +2 -1
- icsDataValidation/connection_setups/databricks_connection_setup.py +0 -0
- icsDataValidation/connection_setups/exasol_connection_setup.py +0 -0
- icsDataValidation/connection_setups/oracle_connection_setup.py +0 -0
- icsDataValidation/connection_setups/snowflake_connection_setup.py +0 -0
- icsDataValidation/connection_setups/sqlserver_connection_setup.py +20 -0
- icsDataValidation/connection_setups/teradata_connection_setup.py +0 -0
- icsDataValidation/core/__init__.py +0 -0
- icsDataValidation/core/database_objects.py +0 -0
- icsDataValidation/core/object_comparison.py +0 -0
- icsDataValidation/input_parameters/__init__.py +0 -0
- icsDataValidation/input_parameters/testing_tool_params.py +4 -3
- icsDataValidation/main.py +15 -11
- icsDataValidation/output_parameters/__init__.py +0 -0
- icsDataValidation/output_parameters/result_params.py +0 -0
- icsDataValidation/services/__init__.py +0 -0
- icsDataValidation/services/comparison_service.py +80 -76
- icsDataValidation/services/database_services/__init__.py +0 -0
- icsDataValidation/services/database_services/azure_service.py +69 -43
- icsDataValidation/services/database_services/databricks_hive_metastore_service.py +20 -7
- icsDataValidation/services/database_services/databricks_unity_catalog_service.py +20 -12
- icsDataValidation/services/database_services/exasol_service.py +26 -23
- icsDataValidation/services/database_services/oracle_service.py +64 -55
- icsDataValidation/services/database_services/snowflake_service.py +85 -36
- icsDataValidation/services/database_services/sqlserver_service.py +868 -0
- icsDataValidation/services/database_services/teradata_service.py +54 -37
- icsDataValidation/services/initialization_service.py +0 -0
- icsDataValidation/services/result_service.py +0 -0
- icsDataValidation/services/system_service.py +4 -0
- icsDataValidation/services/testset_service.py +0 -0
- icsDataValidation/utils/__init__.py +0 -0
- icsDataValidation/utils/file_util.py +0 -0
- icsDataValidation/utils/logger_util.py +0 -0
- icsDataValidation/utils/pandas_util.py +0 -0
- icsDataValidation/utils/parallelization_util.py +0 -0
- icsDataValidation/utils/sql_util.py +0 -0
- icsdatavalidation-1.0.415.dist-info/METADATA +298 -0
- {icsDataValidation-1.0.371.dist-info → icsdatavalidation-1.0.415.dist-info}/RECORD +18 -16
- {icsDataValidation-1.0.371.dist-info → icsdatavalidation-1.0.415.dist-info}/WHEEL +1 -1
- {icsDataValidation-1.0.371.dist-info → icsdatavalidation-1.0.415.dist-info}/top_level.txt +0 -0
- icsDataValidation-1.0.371.dist-info/METADATA +0 -21
|
@@ -1,8 +1,9 @@
|
|
|
1
|
+
import snowflake.connector
|
|
1
2
|
import logging
|
|
3
|
+
import pandas as pd
|
|
4
|
+
|
|
2
5
|
from pathlib import PurePath
|
|
3
6
|
|
|
4
|
-
import pandas as pd
|
|
5
|
-
import snowflake.connector
|
|
6
7
|
from cloe_util_snowflake_connector import connection_parameters
|
|
7
8
|
|
|
8
9
|
from icsDataValidation.core.database_objects import DatabaseObject
|
|
@@ -61,7 +62,7 @@ class SnowflakeService:
|
|
|
61
62
|
return f"Snowflake ERROR: {message}\nFailed statement:\n{statement}"
|
|
62
63
|
|
|
63
64
|
@staticmethod
|
|
64
|
-
def _get_in_clause(key_filters: list, numeric_columns: list, numeric_scale: int) -> str:
|
|
65
|
+
def _get_in_clause(key_filters: list, numeric_columns: list, numeric_scale: int, enclose_column_by_double_quotes: bool = False) -> str:
|
|
65
66
|
"""generates in_clause from list ready to expand the where clause, numeric values are rounded
|
|
66
67
|
|
|
67
68
|
Args:
|
|
@@ -82,15 +83,18 @@ class SnowflakeService:
|
|
|
82
83
|
|
|
83
84
|
in_clause_cols = " AND (("
|
|
84
85
|
for key in key_filters.keys():
|
|
86
|
+
column_identifier = key.replace("'", "")
|
|
87
|
+
if enclose_column_by_double_quotes:
|
|
88
|
+
column_identifier = f'"{column_identifier}"'
|
|
85
89
|
if key in numeric_columns:
|
|
86
|
-
in_clause_cols += f"""ROUND({
|
|
90
|
+
in_clause_cols += f"""ROUND({column_identifier}, {numeric_scale}),"""
|
|
87
91
|
else:
|
|
88
|
-
in_clause_cols +=
|
|
92
|
+
in_clause_cols += f"{column_identifier},"
|
|
89
93
|
in_clause_cols = in_clause_cols[:-1] + ")"
|
|
90
94
|
in_clause = in_clause_cols + " in (" + in_clause_values + ")"
|
|
91
95
|
return in_clause
|
|
92
96
|
|
|
93
|
-
def _get_column_clause(self, column_list: list, columns_datatype: list, numeric_scale, key_columns) -> dict:
|
|
97
|
+
def _get_column_clause(self, column_list: list, columns_datatype: list, numeric_scale, key_columns, enclose_column_by_double_quotes: bool = False) -> dict:
|
|
94
98
|
"""
|
|
95
99
|
Turns list of desired columns into a sql compatible string.
|
|
96
100
|
Columns with a date or time data type are omitted.
|
|
@@ -108,20 +112,26 @@ class SnowflakeService:
|
|
|
108
112
|
used_columns = []
|
|
109
113
|
numeric_columns = []
|
|
110
114
|
for column in column_list:
|
|
115
|
+
|
|
111
116
|
column_datatype = next(x for x in columns_datatype if x["COLUMN_NAME"] == column)["DATA_TYPE"]
|
|
112
117
|
|
|
118
|
+
if enclose_column_by_double_quotes:
|
|
119
|
+
column_identifier = f'"{column}"'
|
|
120
|
+
else:
|
|
121
|
+
column_identifier = column
|
|
122
|
+
|
|
113
123
|
if column in key_columns or column_datatype.lower() not in self.snowflake_datatype_mapping["date_and_time"]:
|
|
114
124
|
if column_datatype.lower() in self.snowflake_datatype_mapping["numeric"]:
|
|
115
125
|
if numeric_scale:
|
|
116
126
|
column_intersecions_new.append(
|
|
117
|
-
f
|
|
127
|
+
f'CAST(ROUND({column_identifier}, {numeric_scale}) as decimal(38,{numeric_scale})) as {column_identifier}'
|
|
118
128
|
)
|
|
119
129
|
else:
|
|
120
|
-
column_intersecions_new.append(f"{
|
|
130
|
+
column_intersecions_new.append(f"{column_identifier} as {column_identifier}")
|
|
121
131
|
used_columns.append(column)
|
|
122
132
|
numeric_columns.append(column)
|
|
123
133
|
elif column_datatype.lower() in self.snowflake_datatype_mapping["string"]:
|
|
124
|
-
column_intersecions_new.append(f"{
|
|
134
|
+
column_intersecions_new.append(f"{column_identifier} AS {column_identifier}")
|
|
125
135
|
used_columns.append(column)
|
|
126
136
|
else:
|
|
127
137
|
column_intersecions_new.append(column)
|
|
@@ -284,7 +294,12 @@ class SnowflakeService:
|
|
|
284
294
|
return dict_colummns_datatype
|
|
285
295
|
|
|
286
296
|
def get_count_distincts_from_object(
|
|
287
|
-
self,
|
|
297
|
+
self,
|
|
298
|
+
object: DatabaseObject,
|
|
299
|
+
column_intersections: list,
|
|
300
|
+
where_clause: str = "",
|
|
301
|
+
exclude_columns: list = [],
|
|
302
|
+
enclose_column_by_double_quotes: bool = False
|
|
288
303
|
) -> dict:
|
|
289
304
|
"""get distinct count for every column in a database object that is in column intersections list
|
|
290
305
|
|
|
@@ -305,8 +320,12 @@ class SnowflakeService:
|
|
|
305
320
|
unions = ""
|
|
306
321
|
|
|
307
322
|
for column in column_intersections:
|
|
323
|
+
if enclose_column_by_double_quotes:
|
|
324
|
+
column_identifier = f'"{column}"'
|
|
325
|
+
else:
|
|
326
|
+
column_identifier = column
|
|
308
327
|
if column not in exclude_columns:
|
|
309
|
-
unions += f
|
|
328
|
+
unions += f' UNION SELECT {column_identifier} AS COLUMN_NAME, COUNT(DISTINCT {column_identifier}) AS COUNT_DISTINCT FROM {object.database}.{object.schema}.{object.name} {where_clause}'
|
|
310
329
|
|
|
311
330
|
query_get_count_distincts_from_object = f"{unions[6:]} ORDER BY COUNT_DISTINCT;"
|
|
312
331
|
error_list = []
|
|
@@ -346,6 +365,7 @@ class SnowflakeService:
|
|
|
346
365
|
where_clause: str = "",
|
|
347
366
|
exclude_columns: list = [],
|
|
348
367
|
numeric_scale: int = None,
|
|
368
|
+
enclose_column_by_double_quotes: bool = False
|
|
349
369
|
) -> list[dict]:
|
|
350
370
|
"""creates checksums for given object in compliance with given conditions
|
|
351
371
|
|
|
@@ -371,29 +391,34 @@ class SnowflakeService:
|
|
|
371
391
|
count_nulls = ""
|
|
372
392
|
|
|
373
393
|
for column in column_intersections:
|
|
394
|
+
if enclose_column_by_double_quotes:
|
|
395
|
+
column_identifier = f'"{column}"'
|
|
396
|
+
else:
|
|
397
|
+
column_identifier = column
|
|
374
398
|
column_datatype = next(x for x in dict_colummns_datatype if x["COLUMN_NAME"] == column)["DATA_TYPE"]
|
|
375
399
|
|
|
376
|
-
count_nulls += f
|
|
400
|
+
count_nulls += f', SUM(CASE WHEN {column_identifier} IS NULL THEN 1 ELSE 0 END) AS "COUNTNULLS_{column}"'
|
|
377
401
|
|
|
378
402
|
if column_datatype.lower() in self.snowflake_datatype_mapping["numeric"]:
|
|
379
403
|
if numeric_scale:
|
|
380
404
|
aggregates += (
|
|
381
|
-
f
|
|
405
|
+
f', CAST(ROUND(SUM({column_identifier}), {numeric_scale}) AS DECIMAL(38, {numeric_scale})) AS "SUM_{column}"'
|
|
382
406
|
)
|
|
383
407
|
else:
|
|
384
|
-
aggregates += f
|
|
408
|
+
aggregates += f', CAST(SUM({column_identifier}) AS DECIMAL(38)) AS "SUM_{column}"'
|
|
385
409
|
|
|
386
410
|
elif (
|
|
387
411
|
column_datatype.lower() in self.snowflake_datatype_mapping["string"]
|
|
388
412
|
or column_datatype.lower() in self.snowflake_datatype_mapping["date_and_time"]
|
|
389
413
|
):
|
|
390
|
-
aggregates += f
|
|
414
|
+
aggregates += f', COUNT(DISTINCT LOWER({column_identifier})) AS "COUNTDISTINCT_{column}"'
|
|
391
415
|
|
|
392
416
|
elif column_datatype.lower() in self.snowflake_datatype_mapping["binary"]:
|
|
393
|
-
aggregates += f
|
|
417
|
+
aggregates += f', COUNT(DISTINCT LOWER(TRY_TO_NUMBER({column_identifier}::VARCHAR))) AS "COUNTDISTINCT_{column}"'
|
|
394
418
|
|
|
395
419
|
elif column_datatype.lower() in self.snowflake_datatype_mapping["boolean"]:
|
|
396
|
-
aggregates += f", MAX(SELECT COUNT(*) FROM {object.database}.{object.schema}.{object.name} WHERE {
|
|
420
|
+
aggregates += f", MAX(SELECT COUNT(*) FROM {object.database}.{object.schema}.{object.name} WHERE {column_identifier} = true)::VARCHAR || '_' || MAX(SELECT COUNT(*) FROM {object.database}.{object.schema}.{object.name} WHERE {column_identifier} = false) :: VARCHAR AS \"AGGREGATEBOOLEAN_{column}\""
|
|
421
|
+
|
|
397
422
|
|
|
398
423
|
# else: Additional Data Types: VARIANT OBJECT ARRAY GEOGRAPHY
|
|
399
424
|
|
|
@@ -450,6 +475,7 @@ class SnowflakeService:
|
|
|
450
475
|
where_clause: str,
|
|
451
476
|
exclude_columns: list,
|
|
452
477
|
numeric_scale: int = None,
|
|
478
|
+
enclose_column_by_double_quotes: bool = False
|
|
453
479
|
) -> list[dict]:
|
|
454
480
|
"""execution of multiple aggregations at once
|
|
455
481
|
|
|
@@ -490,8 +516,12 @@ class SnowflakeService:
|
|
|
490
516
|
|
|
491
517
|
try:
|
|
492
518
|
for column in group_by_columns:
|
|
519
|
+
if enclose_column_by_double_quotes:
|
|
520
|
+
column_identifier = f'"{column}"'
|
|
521
|
+
else:
|
|
522
|
+
column_identifier = column
|
|
493
523
|
if column in column_intersections and column not in exclude_columns:
|
|
494
|
-
group_by_query_columns_string += f"{
|
|
524
|
+
group_by_query_columns_string += f"{column_identifier} ,"
|
|
495
525
|
grouping_columns_final.append(column)
|
|
496
526
|
|
|
497
527
|
group_by_query_columns_string = group_by_query_columns_string[:-1]
|
|
@@ -502,27 +532,31 @@ class SnowflakeService:
|
|
|
502
532
|
aggregates_min = ""
|
|
503
533
|
|
|
504
534
|
for column in aggregation_columns:
|
|
535
|
+
if enclose_column_by_double_quotes:
|
|
536
|
+
column_identifier = f'"{column}"'
|
|
537
|
+
else:
|
|
538
|
+
column_identifier = column
|
|
505
539
|
column_datatype = next(x for x in dict_colummns_datatype if x["COLUMN_NAME"] == column)["DATA_TYPE"]
|
|
506
540
|
|
|
507
541
|
if column_datatype.lower() in self.snowflake_datatype_mapping["numeric"]:
|
|
508
542
|
if numeric_scale:
|
|
509
|
-
aggregates_min += f
|
|
510
|
-
aggregates += f
|
|
543
|
+
aggregates_min += f', CAST(ROUND(MIN({column_identifier}),{numeric_scale}) AS DECIMAL(38,{numeric_scale})) AS "MIN_{column}", CAST(ROUND(max({column_identifier}),{numeric_scale}) AS DECIMAL(38,{numeric_scale})) AS "MAX_{column}"'
|
|
544
|
+
aggregates += f', CAST(ROUND(SUM({column_identifier}), {numeric_scale}) AS DECIMAL(38,{numeric_scale})) AS "SUM_{column}"'
|
|
511
545
|
else:
|
|
512
|
-
aggregates_min += f
|
|
513
|
-
aggregates += f
|
|
546
|
+
aggregates_min += f', MIN({column_identifier}) AS "MIN_{column}", MAX({column_identifier}) AS "MAX_{column}"'
|
|
547
|
+
aggregates += f', SUM({column_identifier}) AS "SUM_{column}"'
|
|
514
548
|
|
|
515
549
|
elif not only_numeric and (
|
|
516
550
|
column_datatype.lower() in self.snowflake_datatype_mapping["string"]
|
|
517
551
|
or column_datatype.lower() in self.snowflake_datatype_mapping["date_and_time"]
|
|
518
552
|
):
|
|
519
|
-
aggregates += f
|
|
553
|
+
aggregates += f', COUNT(DISTINCT LOWER({column_identifier})) AS "COUNTDISTINCT_{column}"'
|
|
520
554
|
|
|
521
555
|
elif not only_numeric and column_datatype.lower() in self.snowflake_datatype_mapping["binary"]:
|
|
522
|
-
aggregates += f
|
|
556
|
+
aggregates += f', COUNT(DISTINCT LOWER(TRY_TO_NUMBER({column_identifier}::VARCHAR))) AS "COUNTDISTINCT_{column}"'
|
|
523
557
|
|
|
524
558
|
elif not only_numeric and column_datatype.lower() in self.snowflake_datatype_mapping["boolean"]:
|
|
525
|
-
aggregates += f", MAX(SELECT COUNT(*) FROM {object.database}.{object.schema}.{object.name} WHERE {
|
|
559
|
+
aggregates += f", MAX(SELECT COUNT(*) FROM {object.database}.{object.schema}.{object.name} WHERE {column_identifier} = true)::VARCHAR || '_' || MAX(SELECT COUNT(*) FROM {object.database}.{object.schema}.{object.name} WHERE {column_identifier} = false) :: VARCHAR AS \"AGGREGATEBOOLEAN_{column}\""
|
|
526
560
|
|
|
527
561
|
# else: Additional Data Types: VARIANT OBJECT ARRAY GEOGRAPHY
|
|
528
562
|
|
|
@@ -573,6 +607,7 @@ class SnowflakeService:
|
|
|
573
607
|
intersection_columns_trgt_src: list,
|
|
574
608
|
where_clause: str = "",
|
|
575
609
|
exclude_columns: list = [],
|
|
610
|
+
enclose_column_by_double_quotes: bool = False
|
|
576
611
|
) -> pd.DataFrame:
|
|
577
612
|
"""creates pandas dataframes with all data from given object in given columns
|
|
578
613
|
|
|
@@ -586,14 +621,17 @@ class SnowflakeService:
|
|
|
586
621
|
|
|
587
622
|
if self.snowflake_connection is None:
|
|
588
623
|
self._connect_to_snowflake()
|
|
589
|
-
|
|
590
|
-
|
|
624
|
+
if enclose_column_by_double_quotes:
|
|
625
|
+
intersection_columns_trgt_src_ = '", "'.join(list(set(intersection_columns_trgt_src) - set(exclude_columns)))
|
|
626
|
+
intersection_columns_trgt_src_ = f'"{intersection_columns_trgt_src_}"'
|
|
627
|
+
else:
|
|
628
|
+
intersection_columns_trgt_src_ = ", ".join(list(set(intersection_columns_trgt_src) - set(exclude_columns)))
|
|
591
629
|
|
|
592
630
|
df_query = f"SELECT {intersection_columns_trgt_src_} FROM {object.database}.{object.schema}.{object.name} {where_clause};"
|
|
593
631
|
|
|
594
|
-
|
|
632
|
+
pdf = self.execute_queries(df_query, True)
|
|
595
633
|
|
|
596
|
-
return
|
|
634
|
+
return pdf
|
|
597
635
|
|
|
598
636
|
def create_pandas_df_from_sample(
|
|
599
637
|
self,
|
|
@@ -606,6 +644,7 @@ class SnowflakeService:
|
|
|
606
644
|
dedicated_columns: list = [],
|
|
607
645
|
sample_count: int = 10,
|
|
608
646
|
numeric_scale: int = None,
|
|
647
|
+
enclose_column_by_double_quotes: bool = False
|
|
609
648
|
) -> list[dict]:
|
|
610
649
|
if self.snowflake_connection is None:
|
|
611
650
|
self._connect_to_snowflake()
|
|
@@ -633,28 +672,37 @@ class SnowflakeService:
|
|
|
633
672
|
dict_colummns_datatype = self.get_data_types_from_object(object, column_intersections)
|
|
634
673
|
|
|
635
674
|
if key_intersection != [] and is_dedicated:
|
|
636
|
-
|
|
675
|
+
if enclose_column_by_double_quotes:
|
|
676
|
+
keys = str(key_intersection)[1:-1].replace("'", "\"")
|
|
677
|
+
else:
|
|
678
|
+
keys = str(key_intersection)[1:-1].replace("'", "")
|
|
679
|
+
|
|
637
680
|
column_clause, numeric_columns, used_columns = self._get_column_clause(
|
|
638
|
-
dedicated_intersection, dict_colummns_datatype, numeric_scale, key_columns
|
|
681
|
+
dedicated_intersection, dict_colummns_datatype, numeric_scale, key_columns,
|
|
682
|
+
enclose_column_by_double_quotes
|
|
639
683
|
)
|
|
640
684
|
if (key_filters != {}) & (filter_intersection != []):
|
|
641
685
|
values = list(key_filters.values())
|
|
642
686
|
if values[0] != []:
|
|
643
|
-
in_clause = self._get_in_clause(key_filters, numeric_columns, numeric_scale)
|
|
687
|
+
in_clause = self._get_in_clause(key_filters, numeric_columns, numeric_scale, enclose_column_by_double_quotes)
|
|
644
688
|
else:
|
|
645
689
|
in_clause = ""
|
|
646
690
|
else:
|
|
647
691
|
in_clause = ""
|
|
648
692
|
sample_query = f"SELECT {column_clause} FROM {object.database}.{object.schema}.{object.name} SAMPLE ({sample_count} ROWS) {where_clause}{in_clause} ORDER BY {keys};"
|
|
649
693
|
elif key_intersection != [] and not is_dedicated:
|
|
650
|
-
|
|
694
|
+
if enclose_column_by_double_quotes:
|
|
695
|
+
keys = str(key_intersection)[1:-1].replace("'", "\"")
|
|
696
|
+
else:
|
|
697
|
+
keys = str(key_intersection)[1:-1].replace("'", "")
|
|
651
698
|
column_clause, numeric_columns, used_columns = self._get_column_clause(
|
|
652
|
-
column_intersections, dict_colummns_datatype, numeric_scale, key_columns
|
|
699
|
+
column_intersections, dict_colummns_datatype, numeric_scale, key_columns,
|
|
700
|
+
enclose_column_by_double_quotes
|
|
653
701
|
)
|
|
654
702
|
if (key_filters != {}) & (filter_intersection != []):
|
|
655
703
|
values = list(key_filters.values())
|
|
656
704
|
if values[0] != []:
|
|
657
|
-
in_clause = self._get_in_clause(key_filters, numeric_columns, numeric_scale)
|
|
705
|
+
in_clause = self._get_in_clause(key_filters, numeric_columns, numeric_scale, enclose_column_by_double_quotes)
|
|
658
706
|
else:
|
|
659
707
|
in_clause = ""
|
|
660
708
|
else:
|
|
@@ -664,7 +712,8 @@ class SnowflakeService:
|
|
|
664
712
|
column_intersections = list(set(column_intersections) - set(exclude_columns))
|
|
665
713
|
column_intersections.sort()
|
|
666
714
|
column_clause, numeric_columns, used_columns = self._get_column_clause(
|
|
667
|
-
column_intersections, dict_colummns_datatype, numeric_scale, key_columns
|
|
715
|
+
column_intersections, dict_colummns_datatype, numeric_scale, key_columns,
|
|
716
|
+
enclose_column_by_double_quotes
|
|
668
717
|
)
|
|
669
718
|
sample_query = f"SELECT {column_clause} FROM {object.database}.{object.schema}.{object.name} SAMPLE ({sample_count} ROWS) {where_clause};"
|
|
670
719
|
|