PyPI - icsDataValidation - Versions diffs - 1.0.430__py3-none-any.whl → 1.0.439__py3-none-any.whl - Mend

icsDataValidation 1.0.430__py3-none-any.whl → 1.0.439__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

icsDataValidation/connection_setups/sqlserver_connection_setup.py CHANGED Viewed

@@ -1,8 +1,5 @@
 import os
-from dotenv import load_dotenv
-from pathlib import Path
 #########################################################################################
 #########################################################################################
@@ -15,6 +12,10 @@ def load_sqlserver_credentials(system_configs:dict,system_selection:str)->dict:
         "Password"  : os.getenv(system_configs[system_selection]["PASSWORD_NAME"]),
         "Driver"    : system_configs[system_selection]["DRIVER"],
         "Port"      : system_configs[system_selection]["PORT"],
+        "Encrypt"   : system_configs[system_selection]["Encrypt"],
+        "TrustServerCertificate"   : system_configs[system_selection]["TrustServerCertificate"]
     }
     return sqlserver_params

icsDataValidation/input_parameters/testing_tool_params.py CHANGED Viewed

@@ -56,7 +56,6 @@ class TestingToolParams:
     max_group_by_count_distinct: int            = int(os.environ.get('MAX_GROUP_BY_COUNT_DISTINCT','max_group_by_count_distinct env variable not found'))
     max_group_by_size: int                      = int(os.environ.get('MAX_GROUP_BY_SIZE','max_group_by_size env variable not found'))
     numeric_scale: int                          = int(os.environ.get('NUMERIC_SCALE','numeric_scale env variable not found'))
-    enclose_column_by_double_quotes: bool       = True if os.environ.get('ENCLOSE_COLUMN_BY_DOUBLE_QUOTES','enclose_column_by_double_quotes env variable not found') == 'True' else False
     branch_name: str                            = os.environ.get('BRANCH_NAME', 'branch_name env variable not found')
     source_branch:str                           = os.environ.get('BUILD_SOURCEBRANCH', 'build_sourcebranch env variable not found')
     azure_storage_connection_string: str        = os.environ.get('AZURE_STORAGE_CONNECTION_STRING','azure_storage_connection_string env variable not found')

icsDataValidation/services/database_services/snowflake_service.py CHANGED Viewed

@@ -1,9 +1,8 @@
-import snowflake.connector
 import logging
-import pandas as pd
 from pathlib import PurePath
+import pandas as pd
+import snowflake.connector
 from cloe_util_snowflake_connector import connection_parameters
 from icsDataValidation.core.database_objects import DatabaseObject
@@ -69,12 +68,16 @@ class SnowflakeService:
             key_filters (list): list of given expected values
             numeric_columns (list): list of all numeric columns
             numeric_scale (int): number of decimal places after rounding
+            enclose_column_by_double_quotes (bool): whether to enclose column names by double quotes
         Returns:
             str: in clause as string
         """
         values = list(key_filters.values())
         in_clause_values = "('"
+        if len(values) == 0:
+            return ""
         for j in range(len(values[0])):
             for value in values:
                 in_clause_values += str(value[j]) + "','"
@@ -104,6 +107,7 @@ class SnowflakeService:
             columns_datatype (list): datatypes of given columns
             numeric_scale (_type_): number of decimal places for numeric columns
             key_columns (_type_):list of columns of interest
+            enclose_column_by_double_quotes (bool): whether to enclose column names by double quotes
         Returns:
             dict: _description_
@@ -141,6 +145,109 @@ class SnowflakeService:
         column_clause = str(column_intersections)[1:-1].replace("'", "")
         return column_clause, numeric_columns, used_columns
+    def _get_checksum_statement(self,
+        object: DatabaseObject,
+        column_intersections: list,
+        where_clause: str = "",
+        exclude_columns: list = [],
+        numeric_scale: int = None,
+        enclose_column_by_double_quotes: bool = False,
+        bool_cast_before_sum: bool = False) -> str:
+        """
+        Creates checksum sql statement for given object in compliance with given conditions
+        object (DatabaseObject): table or view
+        column_intersections (list): columns that are used for checksums
+        where_clause (str, optional): Optional filter criteria given as sql-usable string
+        exclude_columns (list, optional): columns to exlude from calculation
+        numeric_scale (int, optional): number of decimal places for aggregations
+        enclose_column_by_double_quotes (bool, optional): whether to enclose column names by double quotes. Defaults to False.
+        bool_cast_before_sum (bool, optional): whether to cast before sum
+        Returns:
+            str: checksum sql statement
+        """
+        column_intersections = [f'{x}' for x in column_intersections if x not in exclude_columns]
+        logger.debug(f"Column Intersections: {column_intersections}")
+        dict_colummns_datatype = self.get_data_types_from_object(object, column_intersections)
+        aggregates = ""
+        for column in column_intersections:
+            if enclose_column_by_double_quotes:
+                column_identifier = f'"{column}"'
+            else:
+                column_identifier = column
+            column_datatype = next(x for x in dict_colummns_datatype if x["COLUMN_NAME"] == column)["DATA_TYPE"]
+            if column_datatype.lower() in self.snowflake_datatype_mapping["numeric"]:
+                if not bool_cast_before_sum:
+                    if numeric_scale:
+                        aggregates += (
+                            f', CAST(ROUND(SUM({column_identifier}), {numeric_scale}) AS DECIMAL(38, {numeric_scale})) AS "SUM_{column}"'
+                        )
+                    else:
+                        aggregates += f', CAST(SUM({column_identifier}) AS DECIMAL(38)) AS "SUM_{column}"'
+                else:
+                    if numeric_scale:
+                        aggregates += (
+                            f', ROUND(SUM(CAST({column_identifier} AS DECIMAL(38, {numeric_scale}))), {numeric_scale}) AS "SUM_{column}"'
+                        )
+                    else:
+                        aggregates += f', SUM(CAST({column_identifier} AS DECIMAL(38))) AS "SUM_{column}"'
+            elif (
+                column_datatype.lower() in self.snowflake_datatype_mapping["string"]
+                or column_datatype.lower() in self.snowflake_datatype_mapping["date_and_time"]
+            ):
+                aggregates += f', COUNT(DISTINCT LOWER({column_identifier})) AS "COUNTDISTINCT_{column}"'
+            elif column_datatype.lower() in self.snowflake_datatype_mapping["binary"]:
+                aggregates += f', COUNT(DISTINCT LOWER(TRY_CONVERT(VARCHAR,{column_identifier}))) AS "COUNTDISTINCT_{column}"'
+            elif column_datatype.lower() in self.snowflake_datatype_mapping["boolean"]:
+                aggregates += f''', CONCAT(CONCAT(CONVERT(VARCHAR,COUNT(CASE WHEN {column_identifier} = 1 THEN 1 ELSE NULL END)) , '_'),  CONVERT(VARCHAR, COUNT(CASE WHEN {column_identifier} = 0 THEN 1 ELSE NULL END))) AS "AGGREGATEBOOLEAN_{column}"'''
+            #else: Additional Data Types: image , sql_variant, uniqueidentifier, xml, cursor, table, column_datatype.lower() == 'bit' or
+        query_checksums = (
+            f"SELECT {aggregates[1:]} FROM {object.database}.{object.schema}.{object.name} {where_clause};"
+        )
+        return query_checksums
+    def _get_countnulls_statement(self,
+        object: DatabaseObject,
+        column_intersections: list,
+        where_clause: str = "",
+        exclude_columns: list = [],
+        enclose_column_by_double_quotes: bool = False):
+        """
+        Creates countnulls sql statement for given object in compliance with given conditions
+        object (DatabaseObject): table or view
+        column_intersections (list): columns that are used for checksums
+        where_clause (str, optional): Optional filter criteria given as sql-usable string
+        exclude_columns (list, optional): columns to exlude from calculation
+        enclose_column_by_double_quotes (bool, optional): whether to enclose column names by double quotes. Defaults to False.
+        Returns:
+            str: countnulls sql statement
+        """
+        column_intersections = [f"{x}" for x in column_intersections if x not in exclude_columns]
+        logger.debug(f"Column Intersections: {column_intersections}")
+        count_nulls = ""
+        for column in column_intersections:
+            if enclose_column_by_double_quotes:
+                column_identifier = f'"{column}"'
+            else:
+                column_identifier = column
+            count_nulls += f', SUM(CASE WHEN {column_identifier} IS NULL THEN 1 ELSE 0 END) AS "COUNTNULLS_{column}"'
+        query_countnulls = (
+            f"SELECT {count_nulls[1:]} FROM {object.database}.{object.schema}.{object.name} {where_clause};"
+        )
+        return query_countnulls
     def get_database_objects(
         self, database: str, schema: str = None, object_type_restriction: str = "include_all"
     ) -> dict:
@@ -308,6 +415,7 @@ class SnowflakeService:
             column_intersections (list): columns that are used for distinct count
             where_clause (str, optional): optional further filter. Defaults to "".
             exclude_columns (list, optional): columns to exclude from distinct count. Defaults to [].
+            enclose_column_by_double_quotes (bool): whether to enclose column names by double quotes. Defaults to False.
         Returns:
             dict: distinct counts for columns
@@ -383,83 +491,78 @@ class SnowflakeService:
         if self.snowflake_connection is None:
             self._connect_to_snowflake()
-        column_intersections = [f"{x.upper()}" for x in column_intersections if x not in exclude_columns]
-        logger.debug(f"Column Intersections: {column_intersections}")
-        dict_colummns_datatype = self.get_data_types_from_object(object, column_intersections)
-        aggregates = ""
-        count_nulls = ""
-        for column in column_intersections:
-            if enclose_column_by_double_quotes:
-                column_identifier = f'"{column}"'
-            else:
-                column_identifier = column
-            column_datatype = next(x for x in dict_colummns_datatype if x["COLUMN_NAME"] == column)["DATA_TYPE"]
-            count_nulls += f', SUM(CASE WHEN {column_identifier} IS NULL THEN 1 ELSE 0 END) AS "COUNTNULLS_{column}"'
-            if column_datatype.lower() in self.snowflake_datatype_mapping["numeric"]:
-                if numeric_scale:
-                    aggregates += (
-                        f', CAST(ROUND(SUM({column_identifier}), {numeric_scale}) AS DECIMAL(38, {numeric_scale})) AS "SUM_{column}"'
-                    )
-                else:
-                    aggregates += f', CAST(SUM({column_identifier}) AS DECIMAL(38)) AS "SUM_{column}"'
-            elif (
-                column_datatype.lower() in self.snowflake_datatype_mapping["string"]
-                or column_datatype.lower() in self.snowflake_datatype_mapping["date_and_time"]
-            ):
-                aggregates += f', COUNT(DISTINCT LOWER({column_identifier})) AS "COUNTDISTINCT_{column}"'
-            elif column_datatype.lower() in self.snowflake_datatype_mapping["binary"]:
-                aggregates += f', COUNT(DISTINCT LOWER(TRY_TO_NUMBER({column_identifier}::VARCHAR))) AS "COUNTDISTINCT_{column}"'
-            elif column_datatype.lower() in self.snowflake_datatype_mapping["boolean"]:
-                aggregates += f", MAX(SELECT COUNT(*) FROM {object.database}.{object.schema}.{object.name} WHERE {column_identifier} = true)::VARCHAR || '_' || MAX(SELECT COUNT(*) FROM {object.database}.{object.schema}.{object.name} WHERE {column_identifier} = false) :: VARCHAR AS \"AGGREGATEBOOLEAN_{column}\""
-            # else: Additional Data Types: VARIANT OBJECT ARRAY GEOGRAPHY
-        query_checksums = (
-            f"SELECT {aggregates[1:]} FROM {object.database}.{object.schema}.{object.name} {where_clause};"
+        ## get checksum query
+        query_checksums = self._get_checksum_statement(
+            object=object,
+            column_intersections=column_intersections,
+            where_clause=where_clause,
+            exclude_columns=exclude_columns,
+            numeric_scale=numeric_scale,
+            enclose_column_by_double_quotes=enclose_column_by_double_quotes
         )
-        query_countnulls = (
-            f"SELECT {count_nulls[1:]} FROM {object.database}.{object.schema}.{object.name} {where_clause};"
+        ## get countnulls query
+        query_countnulls = self._get_countnulls_statement(
+            object=object,
+            column_intersections=column_intersections,
+            where_clause=where_clause,
+            exclude_columns=exclude_columns,
+            enclose_column_by_double_quotes=enclose_column_by_double_quotes
         )
         error_list = []
         test_list = []
         aggregation_results = {}
+        countnulls_results = {}
         try:
             checksums_results = self.execute_queries([query_checksums, query_countnulls])
             aggregation_results = checksums_results[0][0]
             countnulls_results = checksums_results[1][0]
+        except Exception as err:
+            err_msg = ["ERROR", str(err).split("|||")[0], str(err).split("|||")[1]]
+            if 'Arithmetic overflow' in err_msg[2]:
+                # re-calculate queries with bool_cast_before_sum=True in case of error
+                query_checksums = self.create_checksum_statement(
+                    object=object,
+                    column_intersections=column_intersections,
+                    where_clause=where_clause,
+                    exclude_columns=exclude_columns,
+                    numeric_scale=numeric_scale,
+                    enclose_column_by_double_quotes=enclose_column_by_double_quotes,
+                    bool_cast_before_sum=True
+                )
+                try:
+                    # if overflow then try again with cast before sum for booleans
+                    checksums_results = self.execute_queries([query_checksums, query_countnulls])
+                    aggregation_results = checksums_results[0][0]
+                    countnulls_results = checksums_results[1][0]
+                except Exception as err:
+                    # handle error if it still occurs
+                    err_msg = ["ERROR", str(err).split("|||")[0], str(err).split("|||")[1]]
+                    error_list.append(err_msg)
+            else:
+                # handle error if it is not an overflow
+                error_list.append(err_msg)
+                checksums_results = None
+        # if error occured before this will be skipped as aggregation_results would be empty
+        for i in range(0, len(aggregation_results)):
+            if list(aggregation_results.values())[i] is None:
+                agg_result = 0
+            else:
+                agg_result = list(aggregation_results.values())[i]
-            for i in range(0, len(aggregation_results)):
-                if list(aggregation_results.values())[i] is None:
-                    agg_result = 0
-                else:
-                    agg_result = list(aggregation_results.values())[i]
-                if list(countnulls_results.values())[i] is None:
-                    cnt_result = 0
-                else:
-                    cnt_result = list(countnulls_results.values())[i]
+            if list(countnulls_results.values())[i] is None:
+                cnt_result = 0
+            else:
+                cnt_result = list(countnulls_results.values())[i]
-                test_list.append(
-                    [[item.split("_", 1)[0] for item in list(aggregation_results.keys())][i], agg_result, cnt_result]
-                )
+            test_list.append(
+                [[item.split("_", 1)[0] for item in list(aggregation_results.keys())][i], agg_result, cnt_result]
+            )
-        except Exception as err:
-            error_list.append(["ERROR", str(err).split("|||")[0], str(err).split("|||")[1]])
         checksums = dict(zip([item.split("_", 1)[1] for item in aggregation_results.keys()], test_list))
         checksums["TESTATM_ERRORS"] = error_list
@@ -542,7 +645,7 @@ class SnowflakeService:
                 if column_datatype.lower() in self.snowflake_datatype_mapping["numeric"]:
                     if numeric_scale:
-                        aggregates_min += f', CAST(ROUND(MIN({column_identifier}),{numeric_scale}) AS DECIMAL(38,{numeric_scale})) AS "MIN_{column}", CAST(ROUND(max({column_identifier}),{numeric_scale}) AS DECIMAL(38,{numeric_scale})) AS "MAX_{column}"'
+                        aggregates_min += f', CAST(ROUND(MIN({column_identifier}),{numeric_scale}) AS DECIMAL(38,{numeric_scale})) AS "MIN_{column}", CAST(ROUND(MAX({column_identifier}),{numeric_scale}) AS DECIMAL(38,{numeric_scale})) AS "MAX_{column}"'
                         aggregates += f', CAST(ROUND(SUM({column_identifier}), {numeric_scale}) AS DECIMAL(38,{numeric_scale})) AS "SUM_{column}"'
                     else:
                         aggregates_min += f', MIN({column_identifier}) AS "MIN_{column}", MAX({column_identifier}) AS "MAX_{column}"'

icsDataValidation 1.0.430__py3-none-any.whl → 1.0.439__py3-none-any.whl

icsDataValidation 1.0.430__py3-none-any.whl → 1.0.439__py3-none-any.whl