databricks-labs-lakebridge 0.10.6__py3-none-any.whl → 0.10.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- databricks/labs/lakebridge/__about__.py +1 -1
- databricks/labs/lakebridge/analyzer/__init__.py +0 -0
- databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py +95 -0
- databricks/labs/lakebridge/assessments/profiler_validator.py +103 -0
- databricks/labs/lakebridge/base_install.py +20 -3
- databricks/labs/lakebridge/cli.py +32 -59
- databricks/labs/lakebridge/contexts/application.py +7 -0
- databricks/labs/lakebridge/deployment/job.py +2 -2
- databricks/labs/lakebridge/helpers/file_utils.py +36 -0
- databricks/labs/lakebridge/helpers/validation.py +5 -3
- databricks/labs/lakebridge/install.py +73 -484
- databricks/labs/lakebridge/reconcile/compare.py +70 -33
- databricks/labs/lakebridge/reconcile/connectors/data_source.py +24 -1
- databricks/labs/lakebridge/reconcile/connectors/databricks.py +12 -1
- databricks/labs/lakebridge/reconcile/connectors/dialect_utils.py +126 -0
- databricks/labs/lakebridge/reconcile/connectors/models.py +7 -0
- databricks/labs/lakebridge/reconcile/connectors/oracle.py +12 -1
- databricks/labs/lakebridge/reconcile/connectors/secrets.py +19 -1
- databricks/labs/lakebridge/reconcile/connectors/snowflake.py +63 -30
- databricks/labs/lakebridge/reconcile/connectors/tsql.py +28 -2
- databricks/labs/lakebridge/reconcile/constants.py +4 -3
- databricks/labs/lakebridge/reconcile/execute.py +9 -810
- databricks/labs/lakebridge/reconcile/normalize_recon_config_service.py +133 -0
- databricks/labs/lakebridge/reconcile/query_builder/base.py +53 -18
- databricks/labs/lakebridge/reconcile/query_builder/expression_generator.py +8 -2
- databricks/labs/lakebridge/reconcile/query_builder/hash_query.py +7 -13
- databricks/labs/lakebridge/reconcile/query_builder/sampling_query.py +18 -19
- databricks/labs/lakebridge/reconcile/query_builder/threshold_query.py +36 -15
- databricks/labs/lakebridge/reconcile/recon_config.py +3 -15
- databricks/labs/lakebridge/reconcile/recon_output_config.py +2 -1
- databricks/labs/lakebridge/reconcile/reconciliation.py +511 -0
- databricks/labs/lakebridge/reconcile/schema_compare.py +26 -19
- databricks/labs/lakebridge/reconcile/trigger_recon_aggregate_service.py +78 -0
- databricks/labs/lakebridge/reconcile/trigger_recon_service.py +256 -0
- databricks/labs/lakebridge/reconcile/utils.py +38 -0
- databricks/labs/lakebridge/transpiler/execute.py +34 -28
- databricks/labs/lakebridge/transpiler/installers.py +523 -0
- databricks/labs/lakebridge/transpiler/lsp/lsp_engine.py +47 -60
- databricks/labs/lakebridge/transpiler/sqlglot/dialect_utils.py +2 -0
- databricks/labs/lakebridge/transpiler/transpile_engine.py +0 -18
- {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/METADATA +1 -1
- {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/RECORD +46 -35
- {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/WHEEL +0 -0
- {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/entry_points.txt +0 -0
- {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/licenses/LICENSE +0 -0
- {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/licenses/NOTICE +0 -0
databricks/labs/lakebridge/reconcile/compare.py

@@ -3,6 +3,7 @@ from functools import reduce
 from pyspark.sql import DataFrame, SparkSession
 from pyspark.sql.functions import col, expr, lit
 
+from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils
 from databricks.labs.lakebridge.reconcile.exception import ColumnMismatchException
 from databricks.labs.lakebridge.reconcile.recon_capture import (
     ReconIntermediatePersist,
@@ -22,7 +23,7 @@ _HASH_COLUMN_NAME = "hash_value_recon"
 _SAMPLE_ROWS = 50
 
 
-def raise_column_mismatch_exception(msg: str, source_missing: list[str], target_missing: list[str]) -> Exception:
+def _raise_column_mismatch_exception(msg: str, source_missing: list[str], target_missing: list[str]) -> Exception:
     error_msg = (
         f"{msg}\n"
         f"columns missing in source: {','.join(source_missing) if source_missing else None}\n"
@@ -33,12 +34,25 @@ def raise_column_mismatch_exception(msg: str, source_missing: list[str], target_
 
 def _generate_join_condition(source_alias, target_alias, key_columns):
     conditions = [
-        col(f"{source_alias}.{key_column}").eqNullSafe(col(f"{target_alias}.{key_column}"))
+        col(f"{source_alias}.{DialectUtils.ansi_normalize_identifier(key_column)}").eqNullSafe(
+            col(f"{target_alias}.{DialectUtils.ansi_normalize_identifier(key_column)}")
+        )
         for key_column in key_columns
     ]
     return reduce(lambda a, b: a & b, conditions)
 
 
+def _build_column_selector(table_name, column_name):
+    alias = DialectUtils.ansi_normalize_identifier(f"{table_name}_{DialectUtils.unnormalize_identifier(column_name)}")
+    return f'{table_name}.{DialectUtils.ansi_normalize_identifier(column_name)} as {alias}'
+
+
+def _build_mismatch_column(table, column):
+    return col(DialectUtils.ansi_normalize_identifier(column)).alias(
+        DialectUtils.unnormalize_identifier(column.replace(f'{table}_', '').lower())
+    )
+
+
 def reconcile_data(
     source: DataFrame,
     target: DataFrame,
@@ -59,14 +73,14 @@ def reconcile_data(
             how="full",
         )
         .selectExpr(
-            *[f'{source_alias
-            *[f'{target_alias
+            *[f'{_build_column_selector(source_alias, col_name)}' for col_name in source.columns],
+            *[f'{_build_column_selector(target_alias, col_name)}' for col_name in target.columns],
         )
     )
 
     # Write unmatched df to volume
     df = ReconIntermediatePersist(spark, path).write_and_read_unmatched_df_with_volumes(df)
-    logger.warning(f"Unmatched data
+    logger.warning(f"Unmatched data was written to {path} successfully")
 
     mismatch = _get_mismatch_data(df, source_alias, target_alias) if report_type in {"all", "data"} else None
 
@@ -74,24 +88,24 @@ def reconcile_data(
         df.filter(col(f"{source_alias}_{_HASH_COLUMN_NAME}").isNull())
         .select(
             *[
-
+                _build_mismatch_column(target_alias, col_name)
                 for col_name in df.columns
                 if col_name.startswith(f'{target_alias}_')
             ]
         )
-        .drop(_HASH_COLUMN_NAME)
+        .drop(f"{_HASH_COLUMN_NAME}")
     )
 
     missing_in_tgt = (
         df.filter(col(f"{target_alias}_{_HASH_COLUMN_NAME}").isNull())
         .select(
             *[
-
+                _build_mismatch_column(source_alias, col_name)
                 for col_name in df.columns
                 if col_name.startswith(f'{source_alias}_')
             ]
         )
-        .drop(_HASH_COLUMN_NAME)
+        .drop(f"{_HASH_COLUMN_NAME}")
     )
     mismatch_count = 0
     if mismatch:
@@ -123,23 +137,27 @@ def _get_mismatch_data(df: DataFrame, src_alias: str, tgt_alias: str) -> DataFra
         .filter(col("hash_match") == lit(False))
         .select(
             *[
-
+                _build_mismatch_column(src_alias, col_name)
                 for col_name in df.columns
                 if col_name.startswith(f'{src_alias}_')
             ]
         )
-        .drop(_HASH_COLUMN_NAME)
+        .drop(f"{_HASH_COLUMN_NAME}")
     )
 
 
-def
-
-
+def _build_capture_df(df: DataFrame) -> DataFrame:
+    columns = [
+        col(DialectUtils.ansi_normalize_identifier(column)).alias(DialectUtils.unnormalize_identifier(column))
+        for column in df.columns
+    ]
+    return df.select(*columns)
 
 
 def capture_mismatch_data_and_columns(source: DataFrame, target: DataFrame, key_columns: list[str]) -> MismatchOutput:
-    source_df =
-    target_df =
+    source_df = _build_capture_df(source)
+    target_df = _build_capture_df(target)
+    unnormalized_key_columns = [DialectUtils.unnormalize_identifier(column) for column in key_columns]
 
     source_columns = source_df.columns
     target_columns = target_df.columns
@@ -148,10 +166,10 @@ def capture_mismatch_data_and_columns(source: DataFrame, target: DataFrame, key_
         message = "source and target should have same columns for capturing the mismatch data"
         source_missing = [column for column in target_columns if column not in source_columns]
         target_missing = [column for column in source_columns if column not in target_columns]
-        raise raise_column_mismatch_exception(message, source_missing, target_missing)
+        raise _raise_column_mismatch_exception(message, source_missing, target_missing)
 
-    check_columns = [column for column in source_columns if column not in key_columns]
-    mismatch_df = _get_mismatch_df(source_df, target_df, key_columns, check_columns)
+    check_columns = [column for column in source_columns if column not in unnormalized_key_columns]
+    mismatch_df = _get_mismatch_df(source_df, target_df, unnormalized_key_columns, check_columns)
     mismatch_columns = _get_mismatch_columns(mismatch_df, check_columns)
     return MismatchOutput(mismatch_df, mismatch_columns)
 
@@ -167,31 +185,50 @@ def _get_mismatch_columns(df: DataFrame, columns: list[str]):
     return mismatch_columns
 
 
+def _normalize_mismatch_df_col(column, suffix):
+    unnormalized = DialectUtils.unnormalize_identifier(column) + suffix
+    return DialectUtils.ansi_normalize_identifier(unnormalized)
+
+
+def _unnormalize_mismatch_df_col(column, suffix):
+    unnormalized = DialectUtils.unnormalize_identifier(column) + suffix
+    return unnormalized
+
+
 def _get_mismatch_df(source: DataFrame, target: DataFrame, key_columns: list[str], column_list: list[str]):
-    source_aliased = [
-
+    source_aliased = [
+        col('base.' + DialectUtils.ansi_normalize_identifier(column)).alias(
+            _unnormalize_mismatch_df_col(column, '_base')
+        )
+        for column in column_list
+    ]
+    target_aliased = [
+        col('compare.' + DialectUtils.ansi_normalize_identifier(column)).alias(
+            _unnormalize_mismatch_df_col(column, '_compare')
+        )
+        for column in column_list
+    ]
 
-    match_expr = [
-
+    match_expr = [
+        expr(f"{_normalize_mismatch_df_col(column,'_base')}=={_normalize_mismatch_df_col(column,'_compare')}").alias(
+            _unnormalize_mismatch_df_col(column, '_match')
+        )
+        for column in column_list
+    ]
+    key_cols = [col(DialectUtils.ansi_normalize_identifier(column)) for column in key_columns]
     select_expr = key_cols + source_aliased + target_aliased + match_expr
 
-    filter_columns = " and ".join([column + "_match" for column in column_list])
-    filter_expr = ~expr(filter_columns)
-
     logger.info(f"KEY COLUMNS: {key_columns}")
-    logger.info(f"FILTER COLUMNS: {filter_expr}")
     logger.info(f"SELECT COLUMNS: {select_expr}")
 
     mismatch_df = (
         source.alias('base').join(other=target.alias('compare'), on=key_columns, how="inner").select(*select_expr)
     )
 
-    compare_columns = [
-
-
-
-def alias_column_str(alias: str, columns: list[str]) -> list[str]:
-    return [f"{alias}.{column}" for column in columns]
+    compare_columns = [
+        DialectUtils.ansi_normalize_identifier(column) for column in mismatch_df.columns if column not in key_columns
+    ]
+    return mismatch_df.select(*key_cols + sorted(compare_columns))
 
 
 def _generate_agg_join_condition(source_alias: str, target_alias: str, key_columns: list[str]):
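The net effect of the compare.py changes is that every column reference is routed through `DialectUtils` before it reaches Spark, so identifiers with spaces or special characters survive the join, the select, and the mismatch report. A minimal sketch of what the new helpers evaluate to, derived from the `DialectUtils` implementation shown further down (column names are illustrative, not taken from the package's tests):

```python
from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils

# Key columns are backtick-quoted before being used in the eqNullSafe join condition.
assert DialectUtils.ansi_normalize_identifier("order id") == "`order id`"

# _build_column_selector("source", "`order id`") produces the aliased select expression
# 'source.`order id` as `source_order id`':
alias = DialectUtils.ansi_normalize_identifier("source_" + DialectUtils.unnormalize_identifier("`order id`"))
assert alias == "`source_order id`"

# _build_mismatch_column later strips the "source_" / "target_" prefix again, so the
# mismatch report exposes the plain column name ("order id") to the caller.
```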
databricks/labs/lakebridge/reconcile/connectors/data_source.py

@@ -3,6 +3,7 @@ from abc import ABC, abstractmethod
 
 from pyspark.sql import DataFrame
 
+from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier
 from databricks.labs.lakebridge.reconcile.exception import DataSourceRuntimeException
 from databricks.labs.lakebridge.reconcile.recon_config import JdbcReaderOptions, Schema
 
@@ -28,15 +29,34 @@ class DataSource(ABC):
         catalog: str | None,
         schema: str,
         table: str,
+        normalize: bool = True,
     ) -> list[Schema]:
         return NotImplemented
 
+    @abstractmethod
+    def normalize_identifier(self, identifier: str) -> NormalizedIdentifier:
+        pass
+
     @classmethod
     def log_and_throw_exception(cls, exception: Exception, fetch_type: str, query: str):
         error_msg = f"Runtime exception occurred while fetching {fetch_type} using {query} : {exception}"
         logger.warning(error_msg)
         raise DataSourceRuntimeException(error_msg) from exception
 
+    def _map_meta_column(self, meta_column, normalize: bool) -> Schema:
+        """Create a normalized Schema DTO from the database metadata
+
+        Used in the implementations of get_schema to build a Schema DTO from the `INFORMATION_SCHEMA` query result.
+        The returned Schema is normalized in case the database is having columns with special characters and standardize
+        """
+        name = meta_column.col_name.lower()
+        dtype = meta_column.data_type.strip().lower()
+        if normalize:
+            normalized = self.normalize_identifier(name)
+            return Schema(normalized.ansi_normalized, dtype, normalized.ansi_normalized, normalized.source_normalized)
+
+        return Schema(name, dtype, name, name)
+
 
 class MockDataSource(DataSource):
 
@@ -64,9 +84,12 @@ class MockDataSource(DataSource):
             return self.log_and_throw_exception(self._exception, "data", f"({catalog}, {schema}, {query})")
         return mock_df
 
-    def get_schema(self, catalog: str | None, schema: str, table: str) -> list[Schema]:
+    def get_schema(self, catalog: str | None, schema: str, table: str, normalize: bool = True) -> list[Schema]:
         catalog_str = catalog if catalog else ""
         mock_schema = self._schema_repository.get((catalog_str, schema, table))
         if not mock_schema:
             return self.log_and_throw_exception(self._exception, "schema", f"({catalog}, {schema}, {table})")
         return mock_schema
+
+    def normalize_identifier(self, identifier: str) -> NormalizedIdentifier:
+        return NormalizedIdentifier(identifier, identifier)
databricks/labs/lakebridge/reconcile/connectors/databricks.py

@@ -8,7 +8,9 @@ from pyspark.sql.functions import col
 from sqlglot import Dialect
 
 from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource
+from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier
 from databricks.labs.lakebridge.reconcile.connectors.secrets import SecretsMixin
+from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils
 from databricks.labs.lakebridge.reconcile.recon_config import JdbcReaderOptions, Schema
 from databricks.sdk import WorkspaceClient
 
@@ -35,6 +37,7 @@ def _get_schema_query(catalog: str, schema: str, table: str):
 
 
 class DatabricksDataSource(DataSource, SecretsMixin):
+    _IDENTIFIER_DELIMITER = "`"
 
     def __init__(
         self,
@@ -74,6 +77,7 @@ class DatabricksDataSource(DataSource, SecretsMixin):
         catalog: str | None,
         schema: str,
         table: str,
+        normalize: bool = True,
     ) -> list[Schema]:
         catalog_str = catalog if catalog else "hive_metastore"
         schema_query = _get_schema_query(catalog_str, schema, table)
@@ -82,6 +86,13 @@ class DatabricksDataSource(DataSource, SecretsMixin):
             logger.info(f"Fetching Schema: Started at: {datetime.now()}")
             schema_metadata = self._spark.sql(schema_query).where("col_name not like '#%'").distinct().collect()
             logger.info(f"Schema fetched successfully. Completed at: {datetime.now()}")
-            return [
+            return [self._map_meta_column(field, normalize) for field in schema_metadata]
         except (RuntimeError, PySparkException) as e:
             return self.log_and_throw_exception(e, "schema", schema_query)
+
+    def normalize_identifier(self, identifier: str) -> NormalizedIdentifier:
+        return DialectUtils.normalize_identifier(
+            identifier,
+            source_start_delimiter=DatabricksDataSource._IDENTIFIER_DELIMITER,
+            source_end_delimiter=DatabricksDataSource._IDENTIFIER_DELIMITER,
+        )
databricks/labs/lakebridge/reconcile/connectors/dialect_utils.py (new file)

@@ -0,0 +1,126 @@
+from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier
+
+
+class DialectUtils:
+    _ANSI_IDENTIFIER_DELIMITER = "`"
+
+    @staticmethod
+    def unnormalize_identifier(identifier: str) -> str:
+        """Return an ansi identifier without the outer backticks.
+
+        Use this at your own risk as the missing outer backticks will result in bugs.
+        E.g. <`mary's lamb`> is returned <mary's lamb> so the outer backticks are needed.
+        This is useful for scenarios where the returned identifier will be part of another delimited identifier.
+
+        :param identifier: a database identifier
+        :return: ansi identifier without the outer backticks
+        """
+        ansi = DialectUtils.ansi_normalize_identifier(identifier)
+        unescape = (
+            DialectUtils._unescape_source_end_delimiter(ansi[1:-1], DialectUtils._ANSI_IDENTIFIER_DELIMITER)
+            if ansi
+            else ansi
+        )
+        return unescape
+
+    @staticmethod
+    def ansi_normalize_identifier(identifier: str) -> str:
+        return DialectUtils.normalize_identifier(
+            identifier, DialectUtils._ANSI_IDENTIFIER_DELIMITER, DialectUtils._ANSI_IDENTIFIER_DELIMITER
+        ).ansi_normalized
+
+    @staticmethod
+    def normalize_identifier(
+        identifier: str, source_start_delimiter: str, source_end_delimiter: str
+    ) -> NormalizedIdentifier:
+        identifier = identifier.strip().lower()
+
+        ansi = DialectUtils._normalize_identifier_source_agnostic(
+            identifier,
+            source_start_delimiter,
+            source_end_delimiter,
+            DialectUtils._ANSI_IDENTIFIER_DELIMITER,
+            DialectUtils._ANSI_IDENTIFIER_DELIMITER,
+        )
+
+        # Input was already ansi normalized
+        if ansi == identifier:
+            source = DialectUtils._normalize_identifier_source_agnostic(
+                identifier,
+                DialectUtils._ANSI_IDENTIFIER_DELIMITER,
+                DialectUtils._ANSI_IDENTIFIER_DELIMITER,
+                source_start_delimiter,
+                source_end_delimiter,
+            )
+
+            # Ansi has backticks escaped which has to be unescaped for other delimiters and escape source end delimiters
+            if source != ansi:
+                source = DialectUtils._unescape_source_end_delimiter(source, DialectUtils._ANSI_IDENTIFIER_DELIMITER)
+                source = (
+                    DialectUtils._escape_source_end_delimiter(source, source_start_delimiter, source_end_delimiter)
+                    if source
+                    else source
+                )
+        else:
+            # Make sure backticks are escaped properly for ansi and source end delimiters are unescaped
+            ansi = DialectUtils._unescape_source_end_delimiter(ansi, source_end_delimiter)
+            ansi = DialectUtils._escape_backticks(ansi) if ansi else ansi
+
+            if source_end_delimiter != DialectUtils._ANSI_IDENTIFIER_DELIMITER:
+                ansi = DialectUtils._unescape_source_end_delimiter(ansi, source_end_delimiter)
+
+            source = DialectUtils._normalize_identifier_source_agnostic(
+                identifier, source_start_delimiter, source_end_delimiter, source_start_delimiter, source_end_delimiter
+            )
+
+            # Make sure source end delimiter is escaped else nothing as it was already normalized
+            if source != identifier:
+                source = (
+                    DialectUtils._escape_source_end_delimiter(source, source_start_delimiter, source_end_delimiter)
+                    if source
+                    else source
+                )
+
+        return NormalizedIdentifier(ansi, source)
+
+    @staticmethod
+    def _normalize_identifier_source_agnostic(
+        identifier: str,
+        source_start_delimiter: str,
+        source_end_delimiter: str,
+        expected_source_start_delimiter: str,
+        expected_source_end_delimiter: str,
+    ) -> str:
+        if identifier == "" or identifier is None:
+            return ""
+
+        if DialectUtils.is_already_delimited(
+            identifier, expected_source_start_delimiter, expected_source_end_delimiter
+        ):
+            return identifier
+
+        if DialectUtils.is_already_delimited(identifier, source_start_delimiter, source_end_delimiter):
+            stripped_identifier = identifier.removeprefix(source_start_delimiter).removesuffix(source_end_delimiter)
+        else:
+            stripped_identifier = identifier
+        return f"{expected_source_start_delimiter}{stripped_identifier}{expected_source_end_delimiter}"
+
+    @staticmethod
+    def is_already_delimited(identifier: str, start_delimiter: str, end_delimiter: str) -> bool:
+        return identifier.startswith(start_delimiter) and identifier.endswith(end_delimiter)
+
+    @staticmethod
+    def _escape_backticks(identifier: str) -> str:
+        identifier = identifier[1:-1]
+        identifier = identifier.replace("`", "``")
+        return f"`{identifier}`"
+
+    @staticmethod
+    def _unescape_source_end_delimiter(identifier: str, source_end_delimiter: str) -> str:
+        return identifier.replace(f"{source_end_delimiter}{source_end_delimiter}", source_end_delimiter)
+
+    @staticmethod
+    def _escape_source_end_delimiter(identifier: str, start_end_delimiter, source_end_delimiter: str) -> str:
+        identifier = identifier[1:-1]
+        identifier = identifier.replace(source_end_delimiter, f"{source_end_delimiter}{source_end_delimiter}")
+        return f"{start_end_delimiter}{identifier}{source_end_delimiter}"
databricks/labs/lakebridge/reconcile/connectors/oracle.py

@@ -9,7 +9,9 @@ from sqlglot import Dialect
 
 from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource
 from databricks.labs.lakebridge.reconcile.connectors.jdbc_reader import JDBCReaderMixin
+from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier
 from databricks.labs.lakebridge.reconcile.connectors.secrets import SecretsMixin
+from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils
 from databricks.labs.lakebridge.reconcile.recon_config import JdbcReaderOptions, Schema
 from databricks.sdk import WorkspaceClient
 
@@ -18,6 +20,7 @@ logger = logging.getLogger(__name__)
 
 class OracleDataSource(DataSource, SecretsMixin, JDBCReaderMixin):
     _DRIVER = "oracle"
+    _IDENTIFIER_DELIMITER = "\""
     _SCHEMA_QUERY = """select column_name, case when (data_precision is not null
                                                   and data_scale <> 0)
                                               then data_type || '(' || data_precision || ',' || data_scale || ')'
@@ -78,6 +81,7 @@ class OracleDataSource(DataSource, SecretsMixin, JDBCReaderMixin):
         catalog: str | None,
         schema: str,
         table: str,
+        normalize: bool = True,
     ) -> list[Schema]:
         schema_query = re.sub(
             r'\s+',
@@ -91,7 +95,7 @@ class OracleDataSource(DataSource, SecretsMixin, JDBCReaderMixin):
             schema_metadata = df.select([col(c).alias(c.lower()) for c in df.columns]).collect()
             logger.info(f"Schema fetched successfully. Completed at: {datetime.now()}")
             logger.debug(f"schema_metadata: ${schema_metadata}")
-            return [
+            return [self._map_meta_column(field, normalize) for field in schema_metadata]
         except (RuntimeError, PySparkException) as e:
             return self.log_and_throw_exception(e, "schema", schema_query)
 
@@ -106,3 +110,10 @@ class OracleDataSource(DataSource, SecretsMixin, JDBCReaderMixin):
 
     def reader(self, query: str) -> DataFrameReader:
         return self._get_jdbc_reader(query, self.get_jdbc_url, OracleDataSource._DRIVER)
+
+    def normalize_identifier(self, identifier: str) -> NormalizedIdentifier:
+        return DialectUtils.normalize_identifier(
+            identifier,
+            source_start_delimiter=OracleDataSource._IDENTIFIER_DELIMITER,
+            source_end_delimiter=OracleDataSource._IDENTIFIER_DELIMITER,
+        )
databricks/labs/lakebridge/reconcile/connectors/secrets.py

@@ -11,8 +11,26 @@ class SecretsMixin:
     _ws: WorkspaceClient
     _secret_scope: str
 
+    def _get_secret_or_none(self, secret_key: str) -> str | None:
+        """
+        Get the secret value given a secret scope & secret key. Log a warning if secret does not exist
+        Used To ensure backwards compatibility when supporting new secrets
+        """
+        try:
+            # Return the decoded secret value in string format
+            return self._get_secret(secret_key)
+        except NotFound as e:
+            logger.warning(f"Secret not found: key={secret_key}")
+            logger.debug("Secret lookup failed", exc_info=e)
+            return None
+
     def _get_secret(self, secret_key: str) -> str:
-        """Get the secret value given a secret scope & secret key.
+        """Get the secret value given a secret scope & secret key.
+
+        Raises:
+            NotFound: The secret could not be found.
+            UnicodeDecodeError: The secret value was not Base64-encoded UTF-8.
+        """
         try:
             # Return the decoded secret value in string format
             secret = self._ws.secrets.get_secret(self._secret_scope, secret_key)
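`_get_secret_or_none` keeps the strict `_get_secret` behaviour for required secrets while giving callers a non-raising lookup for optional ones. A small sketch of the intended usage (the consumer class and method name are illustrative; the Snowflake connector below uses the same pattern for the optional `pem_private_key_password` secret):

```python
from databricks.labs.lakebridge.reconcile.connectors.secrets import SecretsMixin


class MyConnector(SecretsMixin):  # illustrative consumer; _ws and _secret_scope are set elsewhere
    def _optional_passphrase(self) -> str | None:
        # Returns None (after logging a warning) when the secret is missing,
        # instead of propagating NotFound the way _get_secret does.
        return self._get_secret_or_none("pem_private_key_password")
```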
databricks/labs/lakebridge/reconcile/connectors/snowflake.py

@@ -11,7 +11,9 @@ from cryptography.hazmat.primitives import serialization
 
 from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource
 from databricks.labs.lakebridge.reconcile.connectors.jdbc_reader import JDBCReaderMixin
+from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier
 from databricks.labs.lakebridge.reconcile.connectors.secrets import SecretsMixin
+from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils
 from databricks.labs.lakebridge.reconcile.exception import InvalidSnowflakePemPrivateKey
 from databricks.labs.lakebridge.reconcile.recon_config import JdbcReaderOptions, Schema
 from databricks.sdk import WorkspaceClient
@@ -22,6 +24,8 @@ logger = logging.getLogger(__name__)
 
 class SnowflakeDataSource(DataSource, SecretsMixin, JDBCReaderMixin):
     _DRIVER = "snowflake"
+    _IDENTIFIER_DELIMITER = "\""
+
     """
     * INFORMATION_SCHEMA:
       - see https://docs.snowflake.com/en/sql-reference/info-schema#considerations-for-replacing-show-commands-with-information-schema-views
@@ -75,31 +79,6 @@ class SnowflakeDataSource(DataSource, SecretsMixin, JDBCReaderMixin):
             f"&warehouse={self._get_secret('sfWarehouse')}&role={self._get_secret('sfRole')}"
         )
 
-    @staticmethod
-    def get_private_key(pem_private_key: str) -> str:
-        try:
-            private_key_bytes = pem_private_key.encode("UTF-8")
-            p_key = serialization.load_pem_private_key(
-                private_key_bytes,
-                password=None,
-                backend=default_backend(),
-            )
-            pkb = p_key.private_bytes(
-                encoding=serialization.Encoding.PEM,
-                format=serialization.PrivateFormat.PKCS8,
-                encryption_algorithm=serialization.NoEncryption(),
-            )
-            pkb_str = pkb.decode("UTF-8")
-            # Remove the first and last lines (BEGIN/END markers)
-            private_key_pem_lines = pkb_str.strip().split('\n')[1:-1]
-            # Join the lines to form the base64 encoded string
-            private_key_pem_str = ''.join(private_key_pem_lines)
-            return private_key_pem_str
-        except Exception as e:
-            message = f"Failed to load or process the provided PEM private key. --> {e}"
-            logger.error(message)
-            raise InvalidSnowflakePemPrivateKey(message) from e
-
     def read_data(
         self,
         catalog: str | None,
@@ -128,6 +107,7 @@ class SnowflakeDataSource(DataSource, SecretsMixin, JDBCReaderMixin):
         catalog: str | None,
         schema: str,
         table: str,
+        normalize: bool = True,
     ) -> list[Schema]:
         """
         Fetch the Schema from the INFORMATION_SCHEMA.COLUMNS table in Snowflake.
@@ -144,13 +124,20 @@ class SnowflakeDataSource(DataSource, SecretsMixin, JDBCReaderMixin):
         try:
             logger.debug(f"Fetching schema using query: \n`{schema_query}`")
             logger.info(f"Fetching Schema: Started at: {datetime.now()}")
-
+            df = self.reader(schema_query).load()
+            schema_metadata = df.select([col(c).alias(c.lower()) for c in df.columns]).collect()
             logger.info(f"Schema fetched successfully. Completed at: {datetime.now()}")
-            return [
+            return [self._map_meta_column(field, normalize) for field in schema_metadata]
         except (RuntimeError, PySparkException) as e:
             return self.log_and_throw_exception(e, "schema", schema_query)
 
     def reader(self, query: str) -> DataFrameReader:
+        options = self._get_snowflake_options()
+        return self._spark.read.format("snowflake").option("dbtable", f"({query}) as tmp").options(**options)
+
+    # TODO cache this method using @functools.cache
+    # Pay attention to https://pylint.pycqa.org/en/latest/user_guide/messages/warning/method-cache-max-size-none.html
+    def _get_snowflake_options(self):
         options = {
             "sfUrl": self._get_secret('sfUrl'),
             "sfUser": self._get_secret('sfUser'),
@@ -159,15 +146,61 @@ class SnowflakeDataSource(DataSource, SecretsMixin, JDBCReaderMixin):
             "sfWarehouse": self._get_secret('sfWarehouse'),
             "sfRole": self._get_secret('sfRole'),
         }
+        options = options | self._get_snowflake_auth_options()
+
+        return options
+
+    def _get_snowflake_auth_options(self):
         try:
-
+            key = SnowflakeDataSource._get_private_key(
+                self._get_secret('pem_private_key'), self._get_secret_or_none('pem_private_key_password')
+            )
+            return {"pem_private_key": key}
         except (NotFound, KeyError):
             logger.warning("pem_private_key not found. Checking for sfPassword")
             try:
-
+                password = self._get_secret('sfPassword')
+                return {"sfPassword": password}
             except (NotFound, KeyError) as e:
                 message = "sfPassword and pem_private_key not found. Either one is required for snowflake auth."
                 logger.error(message)
                 raise NotFound(message) from e
 
-
+    @staticmethod
+    def _get_private_key(pem_private_key: str, pem_private_key_password: str | None) -> str:
+        try:
+            private_key_bytes = pem_private_key.encode("UTF-8")
+            password_bytes = pem_private_key_password.encode("UTF-8") if pem_private_key_password else None
+        except UnicodeEncodeError as e:
+            message = f"Invalid pem key and/or pem password: unable to encode. --> {e}"
+            logger.error(message)
+            raise ValueError(message) from e
+
+        try:
+            p_key = serialization.load_pem_private_key(
+                private_key_bytes,
+                password_bytes,
+                backend=default_backend(),
+            )
+            pkb = p_key.private_bytes(
+                encoding=serialization.Encoding.PEM,
+                format=serialization.PrivateFormat.PKCS8,
+                encryption_algorithm=serialization.NoEncryption(),
+            )
+            pkb_str = pkb.decode("UTF-8")
+            # Remove the first and last lines (BEGIN/END markers)
+            private_key_pem_lines = pkb_str.strip().split('\n')[1:-1]
+            # Join the lines to form the base64 encoded string
+            private_key_pem_str = ''.join(private_key_pem_lines)
+            return private_key_pem_str
+        except Exception as e:
+            message = f"Failed to load or process the provided PEM private key. --> {e}"
+            logger.error(message)
+            raise InvalidSnowflakePemPrivateKey(message) from e
+
+    def normalize_identifier(self, identifier: str) -> NormalizedIdentifier:
+        return DialectUtils.normalize_identifier(
+            identifier,
+            source_start_delimiter=SnowflakeDataSource._IDENTIFIER_DELIMITER,
+            source_end_delimiter=SnowflakeDataSource._IDENTIFIER_DELIMITER,
+        )
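With the new passphrase parameter, the `pem_private_key` secret may now hold an encrypted PKCS#8 key as long as `pem_private_key_password` carries the passphrase; `_get_private_key` decrypts it and returns the bare base64 body that `_get_snowflake_auth_options` passes to the Spark Snowflake reader as the `pem_private_key` option. A self-contained sketch of that round trip, generating throwaway key material locally and calling the private helper directly purely for illustration:

```python
from cryptography.hazmat.primitives import serialization
from cryptography.hazmat.primitives.asymmetric import rsa

from databricks.labs.lakebridge.reconcile.connectors.snowflake import SnowflakeDataSource

# Stand-ins for the values stored in the pem_private_key / pem_private_key_password secrets.
passphrase = "correct horse battery staple"
key = rsa.generate_private_key(public_exponent=65537, key_size=2048)
encrypted_pem = key.private_bytes(
    encoding=serialization.Encoding.PEM,
    format=serialization.PrivateFormat.PKCS8,
    encryption_algorithm=serialization.BestAvailableEncryption(passphrase.encode("utf-8")),
).decode("utf-8")

# _get_private_key loads the encrypted key and returns the base64 body of an
# unencrypted PKCS#8 PEM (the BEGIN/END markers are stripped).
decoded = SnowflakeDataSource._get_private_key(encrypted_pem, passphrase)
assert "BEGIN" not in decoded
```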