PyPI - icsDataValidation - Versions diffs - 1.0.371__py3-none-any.whl → 1.0.415__py3-none-any.whl - Mend

icsDataValidation 1.0.371__py3-none-any.whl → 1.0.415__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (43) hide show

icsDataValidation/services/database_services/snowflake_service.py CHANGED Viewed

@@ -1,8 +1,9 @@
+import snowflake.connector
 import logging
+import pandas as pd
 from pathlib import PurePath
-import pandas as pd
-import snowflake.connector
 from cloe_util_snowflake_connector import connection_parameters
 from icsDataValidation.core.database_objects import DatabaseObject
@@ -61,7 +62,7 @@ class SnowflakeService:
         return f"Snowflake ERROR: {message}\nFailed statement:\n{statement}"
     @staticmethod
-    def _get_in_clause(key_filters: list, numeric_columns: list, numeric_scale: int) -> str:
+    def _get_in_clause(key_filters: list, numeric_columns: list, numeric_scale: int, enclose_column_by_double_quotes: bool = False) -> str:
         """generates in_clause from list ready to expand the where clause, numeric values are rounded
         Args:
@@ -82,15 +83,18 @@ class SnowflakeService:
         in_clause_cols = " AND (("
         for key in key_filters.keys():
+            column_identifier = key.replace("'", "")
+            if enclose_column_by_double_quotes:
+                column_identifier = f'"{column_identifier}"'
             if key in numeric_columns:
-                in_clause_cols += f"""ROUND({key.replace("'", "")},2)""" + ","
+                in_clause_cols += f"""ROUND({column_identifier}, {numeric_scale}),"""
             else:
-                in_clause_cols += key.replace("'", "") + ","
+                in_clause_cols += f"{column_identifier},"
         in_clause_cols = in_clause_cols[:-1] + ")"
         in_clause = in_clause_cols + " in (" + in_clause_values + ")"
         return in_clause
-    def _get_column_clause(self, column_list: list, columns_datatype: list, numeric_scale, key_columns) -> dict:
+    def _get_column_clause(self, column_list: list, columns_datatype: list, numeric_scale, key_columns, enclose_column_by_double_quotes: bool = False) -> dict:
         """
         Turns list of desired columns into a sql compatible string.
         Columns with a date or time data type are omitted.
@@ -108,20 +112,26 @@ class SnowflakeService:
         used_columns = []
         numeric_columns = []
         for column in column_list:
             column_datatype = next(x for x in columns_datatype if x["COLUMN_NAME"] == column)["DATA_TYPE"]
+            if enclose_column_by_double_quotes:
+                column_identifier = f'"{column}"'
+            else:
+                column_identifier = column
             if column in key_columns or column_datatype.lower() not in self.snowflake_datatype_mapping["date_and_time"]:
                 if column_datatype.lower() in self.snowflake_datatype_mapping["numeric"]:
                     if numeric_scale:
                         column_intersecions_new.append(
-                            f"CAST(ROUND({column}, {numeric_scale}) as decimal(38,{numeric_scale})) as {column}"
+                            f'CAST(ROUND({column_identifier}, {numeric_scale}) as decimal(38,{numeric_scale})) as {column_identifier}'
                         )
                     else:
-                        column_intersecions_new.append(f"{column} as {column}")
+                        column_intersecions_new.append(f"{column_identifier} as {column_identifier}")
                     used_columns.append(column)
                     numeric_columns.append(column)
                 elif column_datatype.lower() in self.snowflake_datatype_mapping["string"]:
-                    column_intersecions_new.append(f"{column} AS {column}")
+                    column_intersecions_new.append(f"{column_identifier} AS {column_identifier}")
                     used_columns.append(column)
                 else:
                     column_intersecions_new.append(column)
@@ -284,7 +294,12 @@ class SnowflakeService:
         return dict_colummns_datatype
     def get_count_distincts_from_object(
-        self, object: DatabaseObject, column_intersections: list, where_clause: str = "", exclude_columns: list = []
+        self,
+        object: DatabaseObject,
+        column_intersections: list,
+        where_clause: str = "",
+        exclude_columns: list = [],
+        enclose_column_by_double_quotes: bool = False
     ) -> dict:
         """get distinct count for every column in a database object that is in column intersections list
@@ -305,8 +320,12 @@ class SnowflakeService:
         unions = ""
         for column in column_intersections:
+            if enclose_column_by_double_quotes:
+                column_identifier = f'"{column}"'
+            else:
+                column_identifier = column
             if column not in exclude_columns:
-                unions += f" UNION SELECT '{column}' AS COLUMN_NAME, COUNT(DISTINCT {column}) AS COUNT_DISTINCT FROM {object.database}.{object.schema}.{object.name} {where_clause}"
+                unions += f' UNION SELECT {column_identifier} AS COLUMN_NAME, COUNT(DISTINCT {column_identifier}) AS COUNT_DISTINCT FROM {object.database}.{object.schema}.{object.name} {where_clause}'
         query_get_count_distincts_from_object = f"{unions[6:]} ORDER BY COUNT_DISTINCT;"
         error_list = []
@@ -346,6 +365,7 @@ class SnowflakeService:
         where_clause: str = "",
         exclude_columns: list = [],
         numeric_scale: int = None,
+        enclose_column_by_double_quotes: bool = False
     ) -> list[dict]:
         """creates checksums for given object in compliance with given conditions
@@ -371,29 +391,34 @@ class SnowflakeService:
         count_nulls = ""
         for column in column_intersections:
+            if enclose_column_by_double_quotes:
+                column_identifier = f'"{column}"'
+            else:
+                column_identifier = column
             column_datatype = next(x for x in dict_colummns_datatype if x["COLUMN_NAME"] == column)["DATA_TYPE"]
-            count_nulls += f", SUM(CASE WHEN {column} IS NULL THEN 1 ELSE 0 END) AS COUNTNULLS_{column}"
+            count_nulls += f', SUM(CASE WHEN {column_identifier} IS NULL THEN 1 ELSE 0 END) AS "COUNTNULLS_{column}"'
             if column_datatype.lower() in self.snowflake_datatype_mapping["numeric"]:
                 if numeric_scale:
                     aggregates += (
-                        f", CAST(ROUND(SUM({column}), {numeric_scale}) AS DECIMAL(38, {numeric_scale})) AS sum_{column}"
+                        f', CAST(ROUND(SUM({column_identifier}), {numeric_scale}) AS DECIMAL(38, {numeric_scale})) AS "SUM_{column}"'
                     )
                 else:
-                    aggregates += f", CAST(SUM({column}) AS DECIMAL(38)) AS sum_{column}"
+                    aggregates += f', CAST(SUM({column_identifier}) AS DECIMAL(38)) AS "SUM_{column}"'
             elif (
                 column_datatype.lower() in self.snowflake_datatype_mapping["string"]
                 or column_datatype.lower() in self.snowflake_datatype_mapping["date_and_time"]
             ):
-                aggregates += f", COUNT(DISTINCT LOWER({column})) AS countdistinct_{column}"
+                aggregates += f', COUNT(DISTINCT LOWER({column_identifier})) AS "COUNTDISTINCT_{column}"'
             elif column_datatype.lower() in self.snowflake_datatype_mapping["binary"]:
-                aggregates += f", COUNT(DISTINCT LOWER(TRY_TO_NUMBER({column}::VARCHAR))) AS countdistinct_{column}"
+                aggregates += f', COUNT(DISTINCT LOWER(TRY_TO_NUMBER({column_identifier}::VARCHAR))) AS "COUNTDISTINCT_{column}"'
             elif column_datatype.lower() in self.snowflake_datatype_mapping["boolean"]:
-                aggregates += f", MAX(SELECT COUNT(*) FROM {object.database}.{object.schema}.{object.name} WHERE {column} = true)::VARCHAR || '_' || MAX(SELECT COUNT(*) FROM {object.database}.{object.schema}.{object.name} WHERE {column} = false) :: VARCHAR AS aggregateboolean_{column}"
+                aggregates += f", MAX(SELECT COUNT(*) FROM {object.database}.{object.schema}.{object.name} WHERE {column_identifier} = true)::VARCHAR || '_' || MAX(SELECT COUNT(*) FROM {object.database}.{object.schema}.{object.name} WHERE {column_identifier} = false) :: VARCHAR AS \"AGGREGATEBOOLEAN_{column}\""
             # else: Additional Data Types: VARIANT OBJECT ARRAY GEOGRAPHY
@@ -450,6 +475,7 @@ class SnowflakeService:
         where_clause: str,
         exclude_columns: list,
         numeric_scale: int = None,
+        enclose_column_by_double_quotes: bool = False
     ) -> list[dict]:
         """execution of multiple aggregations at once
@@ -490,8 +516,12 @@ class SnowflakeService:
         try:
             for column in group_by_columns:
+                if enclose_column_by_double_quotes:
+                    column_identifier = f'"{column}"'
+                else:
+                    column_identifier = column
                 if column in column_intersections and column not in exclude_columns:
-                    group_by_query_columns_string += f"{column} ,"
+                    group_by_query_columns_string += f"{column_identifier} ,"
                     grouping_columns_final.append(column)
             group_by_query_columns_string = group_by_query_columns_string[:-1]
@@ -502,27 +532,31 @@ class SnowflakeService:
             aggregates_min = ""
             for column in aggregation_columns:
+                if enclose_column_by_double_quotes:
+                    column_identifier = f'"{column}"'
+                else:
+                    column_identifier = column
                 column_datatype = next(x for x in dict_colummns_datatype if x["COLUMN_NAME"] == column)["DATA_TYPE"]
                 if column_datatype.lower() in self.snowflake_datatype_mapping["numeric"]:
                     if numeric_scale:
-                        aggregates_min += f", CAST(ROUND(MIN({column}),{numeric_scale}) AS DECIMAL(38,{numeric_scale})) AS MIN_{column}, CAST(ROUND(max({column}),{numeric_scale}) AS DECIMAL(38,{numeric_scale})) AS MAX_{column}"
-                        aggregates += f", CAST(ROUND(SUM({column}), {numeric_scale}) AS DECIMAL(38,{numeric_scale})) AS SUM_{column}"
+                        aggregates_min += f', CAST(ROUND(MIN({column_identifier}),{numeric_scale}) AS DECIMAL(38,{numeric_scale})) AS "MIN_{column}", CAST(ROUND(max({column_identifier}),{numeric_scale}) AS DECIMAL(38,{numeric_scale})) AS "MAX_{column}"'
+                        aggregates += f', CAST(ROUND(SUM({column_identifier}), {numeric_scale}) AS DECIMAL(38,{numeric_scale})) AS "SUM_{column}"'
                     else:
-                        aggregates_min += f", MIN({column}) AS MIN_{column}, MAX({column}) AS MAX_{column}"
-                        aggregates += f", SUM({column}) AS SUM_{column}"
+                        aggregates_min += f', MIN({column_identifier}) AS "MIN_{column}", MAX({column_identifier}) AS "MAX_{column}"'
+                        aggregates += f', SUM({column_identifier}) AS "SUM_{column}"'
                 elif not only_numeric and (
                     column_datatype.lower() in self.snowflake_datatype_mapping["string"]
                     or column_datatype.lower() in self.snowflake_datatype_mapping["date_and_time"]
                 ):
-                    aggregates += f", COUNT(DISTINCT LOWER({column})) AS COUNTDISTINCT_{column}"
+                    aggregates += f', COUNT(DISTINCT LOWER({column_identifier})) AS "COUNTDISTINCT_{column}"'
                 elif not only_numeric and column_datatype.lower() in self.snowflake_datatype_mapping["binary"]:
-                    aggregates += f", COUNT(DISTINCT LOWER(TRY_TO_NUMBER({column}::VARCHAR))) AS COUNTDISTINCT_{column}"
+                    aggregates += f', COUNT(DISTINCT LOWER(TRY_TO_NUMBER({column_identifier}::VARCHAR))) AS "COUNTDISTINCT_{column}"'
                 elif not only_numeric and column_datatype.lower() in self.snowflake_datatype_mapping["boolean"]:
-                    aggregates += f", MAX(SELECT COUNT(*) FROM {object.database}.{object.schema}.{object.name} WHERE {column} = true)::VARCHAR || '_' || MAX(SELECT COUNT(*) FROM {object.database}.{object.schema}.{object.name} WHERE {column} = false) :: VARCHAR AS AGGREGATEBOOLEAN_{column}"
+                    aggregates += f", MAX(SELECT COUNT(*) FROM {object.database}.{object.schema}.{object.name} WHERE {column_identifier} = true)::VARCHAR || '_' || MAX(SELECT COUNT(*) FROM {object.database}.{object.schema}.{object.name} WHERE {column_identifier} = false) :: VARCHAR AS \"AGGREGATEBOOLEAN_{column}\""
                 # else: Additional Data Types: VARIANT OBJECT ARRAY GEOGRAPHY
@@ -573,6 +607,7 @@ class SnowflakeService:
         intersection_columns_trgt_src: list,
         where_clause: str = "",
         exclude_columns: list = [],
+        enclose_column_by_double_quotes: bool = False
     ) -> pd.DataFrame:
         """creates pandas dataframes with all data from given object in given columns
@@ -586,14 +621,17 @@ class SnowflakeService:
         if self.snowflake_connection is None:
             self._connect_to_snowflake()
-        intersection_columns_trgt_src_ = ", ".join(list(set(intersection_columns_trgt_src) - set(exclude_columns)))
+        if enclose_column_by_double_quotes:
+            intersection_columns_trgt_src_ = '", "'.join(list(set(intersection_columns_trgt_src) - set(exclude_columns)))
+            intersection_columns_trgt_src_ = f'"{intersection_columns_trgt_src_}"'
+        else:
+            intersection_columns_trgt_src_ = ", ".join(list(set(intersection_columns_trgt_src) - set(exclude_columns)))
         df_query = f"SELECT {intersection_columns_trgt_src_} FROM {object.database}.{object.schema}.{object.name} {where_clause};"
-        src_pdf = self.execute_queries(df_query, True)
+        pdf = self.execute_queries(df_query, True)
-        return src_pdf
+        return pdf
     def create_pandas_df_from_sample(
         self,
@@ -606,6 +644,7 @@ class SnowflakeService:
         dedicated_columns: list = [],
         sample_count: int = 10,
         numeric_scale: int = None,
+        enclose_column_by_double_quotes: bool = False
     ) -> list[dict]:
         if self.snowflake_connection is None:
             self._connect_to_snowflake()
@@ -633,28 +672,37 @@ class SnowflakeService:
             dict_colummns_datatype = self.get_data_types_from_object(object, column_intersections)
         if key_intersection != [] and is_dedicated:
-            keys = str(key_intersection)[1:-1].replace("'", "")
+            if enclose_column_by_double_quotes:
+                keys = str(key_intersection)[1:-1].replace("'", "\"")
+            else:
+                keys = str(key_intersection)[1:-1].replace("'", "")
             column_clause, numeric_columns, used_columns = self._get_column_clause(
-                dedicated_intersection, dict_colummns_datatype, numeric_scale, key_columns
+                dedicated_intersection, dict_colummns_datatype, numeric_scale, key_columns,
+                enclose_column_by_double_quotes
             )
             if (key_filters != {}) & (filter_intersection != []):
                 values = list(key_filters.values())
                 if values[0] != []:
-                    in_clause = self._get_in_clause(key_filters, numeric_columns, numeric_scale)
+                    in_clause = self._get_in_clause(key_filters, numeric_columns, numeric_scale, enclose_column_by_double_quotes)
                 else:
                     in_clause = ""
             else:
                 in_clause = ""
             sample_query = f"SELECT {column_clause} FROM {object.database}.{object.schema}.{object.name} SAMPLE ({sample_count} ROWS) {where_clause}{in_clause} ORDER BY {keys};"
         elif key_intersection != [] and not is_dedicated:
-            keys = str(key_intersection)[1:-1].replace("'", "")
+            if enclose_column_by_double_quotes:
+                keys = str(key_intersection)[1:-1].replace("'", "\"")
+            else:
+                keys = str(key_intersection)[1:-1].replace("'", "")
             column_clause, numeric_columns, used_columns = self._get_column_clause(
-                column_intersections, dict_colummns_datatype, numeric_scale, key_columns
+                column_intersections, dict_colummns_datatype, numeric_scale, key_columns,
+                enclose_column_by_double_quotes
             )
             if (key_filters != {}) & (filter_intersection != []):
                 values = list(key_filters.values())
                 if values[0] != []:
-                    in_clause = self._get_in_clause(key_filters, numeric_columns, numeric_scale)
+                    in_clause = self._get_in_clause(key_filters, numeric_columns, numeric_scale, enclose_column_by_double_quotes)
                 else:
                     in_clause = ""
             else:
@@ -664,7 +712,8 @@ class SnowflakeService:
             column_intersections = list(set(column_intersections) - set(exclude_columns))
             column_intersections.sort()
             column_clause, numeric_columns, used_columns = self._get_column_clause(
-                column_intersections, dict_colummns_datatype, numeric_scale, key_columns
+                column_intersections, dict_colummns_datatype, numeric_scale, key_columns,
+                enclose_column_by_double_quotes
             )
             sample_query = f"SELECT {column_clause} FROM {object.database}.{object.schema}.{object.name} SAMPLE ({sample_count} ROWS) {where_clause};"

icsDataValidation 1.0.371__py3-none-any.whl → 1.0.415__py3-none-any.whl

icsDataValidation 1.0.371__py3-none-any.whl → 1.0.415__py3-none-any.whl