databricks-labs-lakebridge 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- databricks/__init__.py +3 -0
- databricks/labs/__init__.py +3 -0
- databricks/labs/lakebridge/__about__.py +2 -0
- databricks/labs/lakebridge/__init__.py +11 -0
- databricks/labs/lakebridge/assessments/configure_assessment.py +194 -0
- databricks/labs/lakebridge/assessments/pipeline.py +188 -0
- databricks/labs/lakebridge/assessments/profiler_config.py +30 -0
- databricks/labs/lakebridge/base_install.py +12 -0
- databricks/labs/lakebridge/cli.py +449 -0
- databricks/labs/lakebridge/config.py +192 -0
- databricks/labs/lakebridge/connections/__init__.py +0 -0
- databricks/labs/lakebridge/connections/credential_manager.py +89 -0
- databricks/labs/lakebridge/connections/database_manager.py +98 -0
- databricks/labs/lakebridge/connections/env_getter.py +13 -0
- databricks/labs/lakebridge/contexts/__init__.py +0 -0
- databricks/labs/lakebridge/contexts/application.py +133 -0
- databricks/labs/lakebridge/coverage/__init__.py +0 -0
- databricks/labs/lakebridge/coverage/commons.py +223 -0
- databricks/labs/lakebridge/coverage/lakebridge_snow_transpilation_coverage.py +29 -0
- databricks/labs/lakebridge/coverage/local_report.py +9 -0
- databricks/labs/lakebridge/coverage/sqlglot_snow_transpilation_coverage.py +5 -0
- databricks/labs/lakebridge/coverage/sqlglot_tsql_transpilation_coverage.py +5 -0
- databricks/labs/lakebridge/deployment/__init__.py +0 -0
- databricks/labs/lakebridge/deployment/configurator.py +199 -0
- databricks/labs/lakebridge/deployment/dashboard.py +140 -0
- databricks/labs/lakebridge/deployment/installation.py +125 -0
- databricks/labs/lakebridge/deployment/job.py +147 -0
- databricks/labs/lakebridge/deployment/recon.py +145 -0
- databricks/labs/lakebridge/deployment/table.py +30 -0
- databricks/labs/lakebridge/deployment/upgrade_common.py +124 -0
- databricks/labs/lakebridge/discovery/table.py +36 -0
- databricks/labs/lakebridge/discovery/table_definition.py +23 -0
- databricks/labs/lakebridge/discovery/tsql_table_definition.py +185 -0
- databricks/labs/lakebridge/errors/exceptions.py +1 -0
- databricks/labs/lakebridge/helpers/__init__.py +0 -0
- databricks/labs/lakebridge/helpers/db_sql.py +24 -0
- databricks/labs/lakebridge/helpers/execution_time.py +20 -0
- databricks/labs/lakebridge/helpers/file_utils.py +64 -0
- databricks/labs/lakebridge/helpers/metastore.py +164 -0
- databricks/labs/lakebridge/helpers/recon_config_utils.py +176 -0
- databricks/labs/lakebridge/helpers/string_utils.py +62 -0
- databricks/labs/lakebridge/helpers/telemetry_utils.py +13 -0
- databricks/labs/lakebridge/helpers/validation.py +101 -0
- databricks/labs/lakebridge/install.py +849 -0
- databricks/labs/lakebridge/intermediate/__init__.py +0 -0
- databricks/labs/lakebridge/intermediate/dag.py +88 -0
- databricks/labs/lakebridge/intermediate/engine_adapter.py +0 -0
- databricks/labs/lakebridge/intermediate/root_tables.py +44 -0
- databricks/labs/lakebridge/jvmproxy.py +56 -0
- databricks/labs/lakebridge/lineage.py +42 -0
- databricks/labs/lakebridge/reconcile/__init__.py +0 -0
- databricks/labs/lakebridge/reconcile/compare.py +414 -0
- databricks/labs/lakebridge/reconcile/connectors/__init__.py +0 -0
- databricks/labs/lakebridge/reconcile/connectors/data_source.py +72 -0
- databricks/labs/lakebridge/reconcile/connectors/databricks.py +87 -0
- databricks/labs/lakebridge/reconcile/connectors/jdbc_reader.py +41 -0
- databricks/labs/lakebridge/reconcile/connectors/oracle.py +108 -0
- databricks/labs/lakebridge/reconcile/connectors/secrets.py +30 -0
- databricks/labs/lakebridge/reconcile/connectors/snowflake.py +173 -0
- databricks/labs/lakebridge/reconcile/connectors/source_adapter.py +30 -0
- databricks/labs/lakebridge/reconcile/connectors/sql_server.py +132 -0
- databricks/labs/lakebridge/reconcile/constants.py +37 -0
- databricks/labs/lakebridge/reconcile/exception.py +42 -0
- databricks/labs/lakebridge/reconcile/execute.py +920 -0
- databricks/labs/lakebridge/reconcile/query_builder/__init__.py +0 -0
- databricks/labs/lakebridge/reconcile/query_builder/aggregate_query.py +293 -0
- databricks/labs/lakebridge/reconcile/query_builder/base.py +138 -0
- databricks/labs/lakebridge/reconcile/query_builder/count_query.py +33 -0
- databricks/labs/lakebridge/reconcile/query_builder/expression_generator.py +292 -0
- databricks/labs/lakebridge/reconcile/query_builder/hash_query.py +91 -0
- databricks/labs/lakebridge/reconcile/query_builder/sampling_query.py +123 -0
- databricks/labs/lakebridge/reconcile/query_builder/threshold_query.py +231 -0
- databricks/labs/lakebridge/reconcile/recon_capture.py +635 -0
- databricks/labs/lakebridge/reconcile/recon_config.py +363 -0
- databricks/labs/lakebridge/reconcile/recon_output_config.py +85 -0
- databricks/labs/lakebridge/reconcile/runner.py +97 -0
- databricks/labs/lakebridge/reconcile/sampler.py +239 -0
- databricks/labs/lakebridge/reconcile/schema_compare.py +126 -0
- databricks/labs/lakebridge/resources/__init__.py +0 -0
- databricks/labs/lakebridge/resources/config/credentials.yml +33 -0
- databricks/labs/lakebridge/resources/reconcile/__init__.py +0 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/__init__.py +0 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/00_0_aggregate_recon_header.md +6 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_0_recon_id.filter.yml +6 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_1_executed_by.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_2_started_at.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_0_source_type.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_1_source_table.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_2_target_table.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/04_0_aggregate_summary_table.sql +46 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/05_0_aggregate_recon_drilldown_header.md +2 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_0_recon_id.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_1_category.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_2_aggregate_type.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/07_0_target_table.filter.yml +4 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/07_1_source_table.filter.yml +4 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/08_0_aggregate_details_table.sql +92 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/09_0_aggregate_missing_mismatch_header.md +1 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/10_0_aggr_mismatched_records.sql +19 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/11_0_aggr_missing_in_databricks.sql +19 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/11_1_aggr_missing_in_source.sql +19 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/dashboard.yml +365 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/00_0_recon_main.md +3 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/01_0_recon_id.filter.yml +6 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/01_1_report_type.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/01_2_executed_by.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/02_0_source_type.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/02_1_source_table.filter.yml +6 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/02_2_target_table.filter.yml +6 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/03_0_started_at.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/05_0_summary_table.sql +38 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/06_0_schema_comparison_header.md +3 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/07_0_schema_details_table.sql +42 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/08_0_drill_down_header.md +3 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/09_0_recon_id.filter.yml +4 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/09_1_category.filter.yml +4 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/10_0_target_table.filter.yml +4 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/10_1_source_table.filter.yml +4 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/11_0_recon_details_pivot.sql +40 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/12_0_daily_data_validation_issue_header.md +3 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/13_0_success_fail_.filter.yml +4 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/14_0_failed_recon_ids.sql +15 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/15_0_total_failed_runs.sql +10 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/15_1_failed_targets.sql +10 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/15_2_successful_targets.sql +10 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/16_0_missing_mismatch_header.md +1 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/17_0_mismatched_records.sql +14 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/17_1_threshold_mismatches.sql +14 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/18_0_missing_in_databricks.sql +14 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/18_1_missing_in_source.sql +14 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/dashboard.yml +545 -0
- databricks/labs/lakebridge/resources/reconcile/queries/__init__.py +0 -0
- databricks/labs/lakebridge/resources/reconcile/queries/installation/__init__.py +0 -0
- databricks/labs/lakebridge/resources/reconcile/queries/installation/aggregate_details.sql +7 -0
- databricks/labs/lakebridge/resources/reconcile/queries/installation/aggregate_metrics.sql +15 -0
- databricks/labs/lakebridge/resources/reconcile/queries/installation/aggregate_rules.sql +6 -0
- databricks/labs/lakebridge/resources/reconcile/queries/installation/details.sql +7 -0
- databricks/labs/lakebridge/resources/reconcile/queries/installation/main.sql +24 -0
- databricks/labs/lakebridge/resources/reconcile/queries/installation/metrics.sql +21 -0
- databricks/labs/lakebridge/transpiler/__init__.py +0 -0
- databricks/labs/lakebridge/transpiler/execute.py +423 -0
- databricks/labs/lakebridge/transpiler/lsp/__init__.py +0 -0
- databricks/labs/lakebridge/transpiler/lsp/lsp_engine.py +564 -0
- databricks/labs/lakebridge/transpiler/sqlglot/__init__.py +0 -0
- databricks/labs/lakebridge/transpiler/sqlglot/dialect_utils.py +30 -0
- databricks/labs/lakebridge/transpiler/sqlglot/generator/__init__.py +0 -0
- databricks/labs/lakebridge/transpiler/sqlglot/generator/databricks.py +771 -0
- databricks/labs/lakebridge/transpiler/sqlglot/lca_utils.py +138 -0
- databricks/labs/lakebridge/transpiler/sqlglot/local_expression.py +197 -0
- databricks/labs/lakebridge/transpiler/sqlglot/parsers/__init__.py +0 -0
- databricks/labs/lakebridge/transpiler/sqlglot/parsers/oracle.py +23 -0
- databricks/labs/lakebridge/transpiler/sqlglot/parsers/presto.py +202 -0
- databricks/labs/lakebridge/transpiler/sqlglot/parsers/snowflake.py +535 -0
- databricks/labs/lakebridge/transpiler/sqlglot/sqlglot_engine.py +203 -0
- databricks/labs/lakebridge/transpiler/transpile_engine.py +49 -0
- databricks/labs/lakebridge/transpiler/transpile_status.py +68 -0
- databricks/labs/lakebridge/uninstall.py +28 -0
- databricks/labs/lakebridge/upgrades/v0.4.0_add_main_table_operation_name_column.py +80 -0
- databricks/labs/lakebridge/upgrades/v0.6.0_alter_metrics_datatype.py +51 -0
- databricks_labs_lakebridge-0.10.0.dist-info/METADATA +58 -0
- databricks_labs_lakebridge-0.10.0.dist-info/RECORD +171 -0
- databricks_labs_lakebridge-0.10.0.dist-info/WHEEL +4 -0
- databricks_labs_lakebridge-0.10.0.dist-info/entry_points.txt +2 -0
- databricks_labs_lakebridge-0.10.0.dist-info/licenses/LICENSE +69 -0
- databricks_labs_lakebridge-0.10.0.dist-info/licenses/NOTICE +42 -0
- docs/lakebridge/src/components/Button.tsx +81 -0
- docs/lakebridge/src/css/custom.css +167 -0
- docs/lakebridge/src/css/table.css +20 -0
- docs/lakebridge/src/pages/index.tsx +57 -0
- docs/lakebridge/src/theme/Footer/index.tsx +24 -0
- docs/lakebridge/src/theme/Layout/index.tsx +18 -0
databricks/labs/lakebridge/reconcile/query_builder/expression_generator.py
@@ -0,0 +1,292 @@

```python
from collections.abc import Callable
from functools import partial

from pyspark.sql.types import DataType, NumericType
from sqlglot import Dialect
from sqlglot import expressions as exp

from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import get_dialect
from databricks.labs.lakebridge.reconcile.recon_config import HashAlgoMapping


def _apply_func_expr(expr: exp.Expression, expr_func: Callable, **kwargs) -> exp.Expression:
    is_terminal = isinstance(expr, exp.Column)
    new_expr = expr.copy()
    for node in new_expr.dfs():
        if isinstance(node, exp.Column):
            column_name = node.name
            table_name = node.table
            func = expr_func(this=exp.Column(this=column_name, table=table_name), **kwargs)
            if is_terminal:
                return func
            node.replace(func)
    return new_expr


def concat(expr: list[exp.Expression]) -> exp.Expression:
    return exp.Concat(expressions=expr, safe=True)


def sha2(expr: exp.Expression, num_bits: str, is_expr: bool = False) -> exp.Expression:
    if is_expr:
        return exp.SHA2(this=expr, length=exp.Literal(this=num_bits, is_string=False))
    return _apply_func_expr(expr, exp.SHA2, length=exp.Literal(this=num_bits, is_string=False))


def md5(expr: exp.Expression, is_expr: bool = False) -> exp.Expression:
    if is_expr:
        return exp.MD5(this=expr)
    return _apply_func_expr(expr, exp.MD5)


def lower(expr: exp.Expression, is_expr: bool = False) -> exp.Expression:
    if is_expr:
        return exp.Lower(this=expr)
    return _apply_func_expr(expr, exp.Lower)


def coalesce(expr: exp.Expression, default="0", is_string=False) -> exp.Expression:
    expressions = [exp.Literal(this=default, is_string=is_string)]
    return _apply_func_expr(expr, exp.Coalesce, expressions=expressions)


def trim(expr: exp.Expression) -> exp.Trim | exp.Expression:
    return _apply_func_expr(expr, exp.Trim)


def json_format(expr: exp.Expression, options: dict[str, str] | None = None) -> exp.Expression:
    return _apply_func_expr(expr, exp.JSONFormat, options=options)


def sort_array(expr: exp.Expression, asc=True) -> exp.Expression:
    return _apply_func_expr(expr, exp.SortArray, asc=exp.Boolean(this=asc))


def to_char(expr: exp.Expression, to_format=None, nls_param=None) -> exp.Expression:
    if to_format:
        return _apply_func_expr(
            expr, exp.ToChar, format=exp.Literal(this=to_format, is_string=True), nls_param=nls_param
        )
    return _apply_func_expr(expr, exp.ToChar)


def array_to_string(
    expr: exp.Expression,
    delimiter: str = ",",
    is_string=True,
    null_replacement: str | None = None,
    is_null_replace=True,
) -> exp.Expression:
    if null_replacement:
        return _apply_func_expr(
            expr,
            exp.ArrayToString,
            expression=[exp.Literal(this=delimiter, is_string=is_string)],
            null=exp.Literal(this=null_replacement, is_string=is_null_replace),
        )
    return _apply_func_expr(expr, exp.ArrayToString, expression=[exp.Literal(this=delimiter, is_string=is_string)])


def array_sort(expr: exp.Expression, asc=True) -> exp.Expression:
    return _apply_func_expr(expr, exp.ArraySort, expression=exp.Boolean(this=asc))


def anonymous(expr: exp.Column, func: str, is_expr: bool = False, dialect=None) -> exp.Expression:
    """
    Use this when a SQL function is not available as a sqlglot expression.

    Example:
    >>> from sqlglot import parse_one
    >>> print(repr(parse_one('select unix_timestamp(col1)')))

    The code above gives you a Select expression wrapping an Anonymous function.
    To achieve the same, use this function as below:

    >>> expr = parse_one("select col1 from dual")
    >>> transformed_expr = anonymous(expr, "unix_timestamp({})")
    >>> print(transformed_expr)
    SELECT UNIX_TIMESTAMP(col1) FROM DUAL
    """
    if is_expr:
        if dialect:
            return exp.Column(this=func.format(expr.sql(dialect=dialect)))
        return exp.Column(this=func.format(expr))
    is_terminal = isinstance(expr, exp.Column)
    new_expr = expr.copy()
    for node in new_expr.dfs():
        if isinstance(node, exp.Column):
            name = f"{node.table}.{node.name}" if node.table else node.name
            anonymous_func = exp.Column(this=func.format(name))
            if is_terminal:
                return anonymous_func
            node.replace(anonymous_func)
    return new_expr


def build_column(this: exp.ExpOrStr, table_name="", quoted=False, alias=None) -> exp.Expression:
    if alias:
        if isinstance(this, str):
            return exp.Alias(
                this=exp.Column(this=this, table=table_name), alias=exp.Identifier(this=alias, quoted=quoted)
            )
        return exp.Alias(this=this, alias=exp.Identifier(this=alias, quoted=quoted))
    return exp.Column(this=exp.Identifier(this=this, quoted=quoted), table=table_name)


def build_literal(this: exp.ExpOrStr, alias=None, quoted=False, is_string=True, cast=None) -> exp.Expression:
    base_literal = exp.Literal(this=this, is_string=is_string)
    if not cast and not alias:
        return base_literal

    cast_expr = exp.Cast(this=base_literal, to=exp.DataType(this=cast)) if cast else base_literal
    return exp.Alias(this=cast_expr, alias=exp.Identifier(this=alias, quoted=quoted)) if alias else cast_expr


def transform_expression(
    expr: exp.Expression,
    funcs: list[Callable[[exp.Expression], exp.Expression]],
) -> exp.Expression:
    for func in funcs:
        expr = func(expr)
        assert isinstance(expr, exp.Expression), (
            f"Func returned an instance of type [{type(expr)}], " "should have been Expression."
        )
    return expr


def get_hash_transform(
    source: Dialect,
    layer: str,
):
    dialect_algo = Dialect_hash_algo_mapping.get(source)
    if not dialect_algo:
        raise ValueError(f"Source {source} is not supported. Please add it to Dialect_hash_algo_mapping dictionary.")

    layer_algo = getattr(dialect_algo, layer, None)
    if not layer_algo:
        raise ValueError(
            f"Layer {layer} is not supported for source {source}. Please add it to Dialect_hash_algo_mapping dictionary."
        )
    return [layer_algo]


def build_from_clause(table_name: str, table_alias: str | None = None) -> exp.From:
    return exp.From(this=exp.Table(this=exp.Identifier(this=table_name), alias=table_alias))


def build_join_clause(
    table_name: str,
    join_columns: list,
    source_table_alias: str | None = None,
    target_table_alias: str | None = None,
    kind: str = "inner",
    func: Callable = exp.NullSafeEQ,
) -> exp.Join:
    join_conditions = []
    for column in join_columns:
        join_condition = func(
            this=exp.Column(this=column, table=source_table_alias),
            expression=exp.Column(this=column, table=target_table_alias),
        )
        join_conditions.append(join_condition)

    # Combine all join conditions with AND
    on_condition: exp.NullSafeEQ | exp.And = join_conditions[0]
    for condition in join_conditions[1:]:
        on_condition = exp.And(this=on_condition, expression=condition)

    return exp.Join(
        this=exp.Table(this=exp.Identifier(this=table_name), alias=target_table_alias), kind=kind, on=on_condition
    )


def build_sub(
    left_column_name: str,
    right_column_name: str,
    left_table_name: str | None = None,
    right_table_name: str | None = None,
) -> exp.Sub:
    return exp.Sub(
        this=build_column(left_column_name, left_table_name),
        expression=build_column(right_column_name, right_table_name),
    )


def build_where_clause(where_clause: list[exp.Expression], condition_type: str = "or") -> exp.Expression:
    func = exp.Or if condition_type == "or" else exp.And
    # Start with a default (always-true) seed
    combined_expression: exp.Expression = exp.Paren(this=func(this='1 = 1', expression='1 = 1'))

    # Combine the expressions with the chosen operator
    for expression in where_clause:
        combined_expression = func(this=combined_expression, expression=expression)

    return combined_expression


def build_if(this: exp.Expression, true: exp.Expression, false: exp.Expression | None = None) -> exp.If:
    return exp.If(this=this, true=true, false=false)


def build_between(this: exp.Expression, low: exp.Expression, high: exp.Expression) -> exp.Between:
    return exp.Between(this=this, low=low, high=high)


def _get_is_string(column_types_dict: dict[str, DataType], column_name: str) -> bool:
    if isinstance(column_types_dict.get(column_name), NumericType):
        return False
    return True


DataType_transform_mapping: dict[str, dict[str, list[partial[exp.Expression]]]] = {
    "universal": {"default": [partial(coalesce, default='_null_recon_', is_string=True), partial(trim)]},
    "snowflake": {exp.DataType.Type.ARRAY.value: [partial(array_to_string), partial(array_sort)]},
    "oracle": {
        exp.DataType.Type.NCHAR.value: [
            partial(anonymous, func="NVL(TRIM(TO_CHAR({})),'_null_recon_')", dialect=get_dialect("oracle"))
        ],
        exp.DataType.Type.NVARCHAR.value: [
            partial(anonymous, func="NVL(TRIM(TO_CHAR({})),'_null_recon_')", dialect=get_dialect("oracle"))
        ],
    },
    "databricks": {
        exp.DataType.Type.ARRAY.value: [
            partial(anonymous, func="CONCAT_WS(',', SORT_ARRAY({}))", dialect=get_dialect("databricks"))
        ],
    },
    "tsql": {
        "default": [partial(anonymous, func="COALESCE(LTRIM(RTRIM(CAST([{}] AS VARCHAR(256)))), '_null_recon_')")],
        exp.DataType.Type.DATE.value: [partial(anonymous, func="COALESCE(CONVERT(DATE, {0}, 101), '1900-01-01')")],
        exp.DataType.Type.TIME.value: [partial(anonymous, func="COALESCE(CONVERT(TIME, {0}, 108), '00:00:00')")],
        exp.DataType.Type.DATETIME.value: [
            partial(anonymous, func="COALESCE(CONVERT(DATETIME, {0}, 120), '1900-01-01 00:00:00')")
        ],
    },
}

sha256_partial = partial(sha2, num_bits="256", is_expr=True)
md5_partial = partial(md5, is_expr=True)
Dialect_hash_algo_mapping: dict[Dialect, HashAlgoMapping] = {
    get_dialect("snowflake"): HashAlgoMapping(
        source=sha256_partial,
        target=sha256_partial,
    ),
    get_dialect("oracle"): HashAlgoMapping(
        source=partial(
            anonymous, func="DBMS_CRYPTO.HASH(RAWTOHEX({}), 2)", is_expr=True, dialect=get_dialect("oracle")
        ),
        target=md5_partial,
    ),
    get_dialect("databricks"): HashAlgoMapping(
        source=sha256_partial,
        target=sha256_partial,
    ),
    get_dialect("tsql"): HashAlgoMapping(
        source=partial(
            anonymous, func="CONVERT(VARCHAR(256), HASHBYTES('SHA2_256', CONVERT(VARCHAR(256),{})), 2)", is_expr=True
        ),
        target=sha256_partial,
    ),
}
```
databricks/labs/lakebridge/reconcile/query_builder/hash_query.py
@@ -0,0 +1,91 @@

```python
import logging

from functools import reduce
import sqlglot.expressions as exp
from sqlglot import Dialect

from databricks.labs.lakebridge.reconcile.query_builder.base import QueryBuilder
from databricks.labs.lakebridge.reconcile.query_builder.expression_generator import (
    build_column,
    concat,
    get_hash_transform,
    lower,
    transform_expression,
)
from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import get_dialect

logger = logging.getLogger(__name__)


def _hash_transform(
    node: exp.Expression,
    source: Dialect,
    layer: str,
):
    transform = get_hash_transform(source, layer)
    return transform_expression(node, transform)


_HASH_COLUMN_NAME = "hash_value_recon"


class HashQueryBuilder(QueryBuilder):

    def build_query(self, report_type: str) -> str:

        if report_type != 'row':
            self._validate(self.join_columns, f"Join Columns are compulsory for {report_type} type")

        _join_columns = self.join_columns if self.join_columns else set()
        hash_cols = sorted((_join_columns | self.select_columns) - self.threshold_columns - self.drop_columns)

        key_cols = hash_cols if report_type == "row" else sorted(_join_columns | self.partition_column)

        cols_with_alias = [
            build_column(this=col, alias=self.table_conf.get_layer_tgt_to_src_col_mapping(col, self.layer))
            for col in key_cols
        ]

        # If a column mapping exists, sort the target columns in source-column order
        # so that both layers produce the same hash value.
        hash_cols_with_alias = [
            {"this": col, "alias": self.table_conf.get_layer_tgt_to_src_col_mapping(col, self.layer)}
            for col in hash_cols
        ]
        sorted_hash_cols_with_alias = sorted(hash_cols_with_alias, key=lambda column: column["alias"])
        hashcols_sorted_as_src_seq = [column["this"] for column in sorted_hash_cols_with_alias]

        key_cols_with_transform = (
            self._apply_user_transformation(cols_with_alias) if self.user_transformations else cols_with_alias
        )
        hash_col_with_transform = [self._generate_hash_algorithm(hashcols_sorted_as_src_seq, _HASH_COLUMN_NAME)]

        dialect = self.engine if self.layer == "source" else get_dialect("databricks")
        res = (
            exp.select(*hash_col_with_transform + key_cols_with_transform)
            .from_(":tbl")
            .where(self.filter)
            .sql(dialect=dialect)
        )

        logger.info(f"Hash Query for {self.layer}: {res}")
        return res

    def _generate_hash_algorithm(
        self,
        cols: list[str],
        column_alias: str,
    ) -> exp.Expression:
        cols_with_alias = [build_column(this=col, alias=None) for col in cols]
        cols_with_transform = self.add_transformations(
            cols_with_alias, self.engine if self.layer == "source" else get_dialect("databricks")
        )
        col_exprs = exp.select(*cols_with_transform).iter_expressions()
        concat_expr = concat(list(col_exprs))

        if self.engine == "oracle":
            concat_expr = reduce(lambda x, y: exp.DPipe(this=x, expression=y), concat_expr.expressions)

        hash_expr = concat_expr.transform(_hash_transform, self.engine, self.layer).transform(lower, is_expr=True)

        return build_column(hash_expr, alias=column_alias)
```
databricks/labs/lakebridge/reconcile/query_builder/sampling_query.py
@@ -0,0 +1,123 @@

```python
import logging

import sqlglot.expressions as exp
from pyspark.sql import DataFrame
from sqlglot import select

from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import get_key_from_dialect
from databricks.labs.lakebridge.reconcile.query_builder.base import QueryBuilder
from databricks.labs.lakebridge.reconcile.query_builder.expression_generator import (
    build_column,
    build_literal,
    _get_is_string,
    build_join_clause,
)

logger = logging.getLogger(__name__)


def _union_concat(
    unions: list[exp.Select],
    result: exp.Union | exp.Select,
    cnt=0,
) -> exp.Select | exp.Union:
    if len(unions) == 1:
        return result
    if cnt == len(unions) - 2:
        return exp.union(result, unions[cnt + 1])
    cnt = cnt + 1
    res = exp.union(result, unions[cnt])
    return _union_concat(unions, res, cnt)


class SamplingQueryBuilder(QueryBuilder):
    def build_query_with_alias(self):
        self._validate(self.join_columns, "Join Columns are compulsory for sampling query")
        join_columns = self.join_columns if self.join_columns else set()

        cols = sorted((join_columns | self.select_columns) - self.threshold_columns - self.drop_columns)

        cols_with_alias = [
            build_column(this=col, alias=self.table_conf.get_layer_tgt_to_src_col_mapping(col, self.layer))
            for col in cols
        ]

        query = select(*cols_with_alias).from_(":tbl").where(self.filter).sql(dialect=self.engine)

        logger.info(f"Sampling Query with Alias for {self.layer}: {query}")
        return query

    def build_query(self, df: DataFrame):
        self._validate(self.join_columns, "Join Columns are compulsory for sampling query")
        join_columns = self.join_columns if self.join_columns else set()
        if self.layer == "source":
            key_cols = sorted(join_columns)
        else:
            key_cols = sorted(self.table_conf.get_tgt_to_src_col_mapping_list(join_columns))
        keys_df = df.select(*key_cols)
        with_clause = self._get_with_clause(keys_df)

        cols = sorted((join_columns | self.select_columns) - self.threshold_columns - self.drop_columns)

        cols_with_alias = [
            build_column(this=col, alias=self.table_conf.get_layer_tgt_to_src_col_mapping(col, self.layer))
            for col in cols
        ]

        sql_with_transforms = self.add_transformations(cols_with_alias, self.engine)
        query_sql = select(*sql_with_transforms).from_(":tbl").where(self.filter)
        if self.layer == "source":
            with_select = [build_column(this=col, table_name="src") for col in sorted(cols)]
        else:
            with_select = [
                build_column(this=col, table_name="src")
                for col in sorted(self.table_conf.get_tgt_to_src_col_mapping_list(cols))
            ]

        join_clause = SamplingQueryBuilder._get_join_clause(key_cols)

        query = (
            with_clause.with_(alias="src", as_=query_sql)
            .select(*with_select)
            .from_("src")
            .join(join_clause)
            .sql(dialect=self.engine)
        )
        logger.info(f"Sampling Query for {self.layer}: {query}")
        return query

    @classmethod
    def _get_join_clause(cls, key_cols: list):
        return build_join_clause(
            "recon", key_cols, source_table_alias="src", target_table_alias="recon", kind="inner", func=exp.EQ
        )

    def _get_with_clause(self, df: DataFrame) -> exp.Select:
        union_res = []
        for row in df.collect():
            column_types = [(str(f.name).lower(), f.dataType) for f in df.schema.fields]
            column_types_dict = dict(column_types)
            orig_types_dict = {
                schema.column_name: schema.data_type
                for schema in self.schema
                if schema.column_name not in self.user_transformations
            }
            row_select = [
                (
                    build_literal(
                        this=str(value),
                        alias=col,
                        is_string=_get_is_string(column_types_dict, col),
                        cast=orig_types_dict.get(col),
                    )
                    if value is not None
                    else exp.Alias(this=exp.Null(), alias=col)
                )
                for col, value in zip(df.columns, row)
            ]
            if get_key_from_dialect(self.engine) == "oracle":
                union_res.append(select(*row_select).from_("dual"))
            else:
                union_res.append(select(*row_select))
        union_statements = _union_concat(union_res, union_res[0], 0)
        return exp.Select().with_(alias='recon', as_=union_statements)
```