databricks-labs-lakebridge 0.10.0 (databricks_labs_lakebridge-0.10.0-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- databricks/__init__.py +3 -0
- databricks/labs/__init__.py +3 -0
- databricks/labs/lakebridge/__about__.py +2 -0
- databricks/labs/lakebridge/__init__.py +11 -0
- databricks/labs/lakebridge/assessments/configure_assessment.py +194 -0
- databricks/labs/lakebridge/assessments/pipeline.py +188 -0
- databricks/labs/lakebridge/assessments/profiler_config.py +30 -0
- databricks/labs/lakebridge/base_install.py +12 -0
- databricks/labs/lakebridge/cli.py +449 -0
- databricks/labs/lakebridge/config.py +192 -0
- databricks/labs/lakebridge/connections/__init__.py +0 -0
- databricks/labs/lakebridge/connections/credential_manager.py +89 -0
- databricks/labs/lakebridge/connections/database_manager.py +98 -0
- databricks/labs/lakebridge/connections/env_getter.py +13 -0
- databricks/labs/lakebridge/contexts/__init__.py +0 -0
- databricks/labs/lakebridge/contexts/application.py +133 -0
- databricks/labs/lakebridge/coverage/__init__.py +0 -0
- databricks/labs/lakebridge/coverage/commons.py +223 -0
- databricks/labs/lakebridge/coverage/lakebridge_snow_transpilation_coverage.py +29 -0
- databricks/labs/lakebridge/coverage/local_report.py +9 -0
- databricks/labs/lakebridge/coverage/sqlglot_snow_transpilation_coverage.py +5 -0
- databricks/labs/lakebridge/coverage/sqlglot_tsql_transpilation_coverage.py +5 -0
- databricks/labs/lakebridge/deployment/__init__.py +0 -0
- databricks/labs/lakebridge/deployment/configurator.py +199 -0
- databricks/labs/lakebridge/deployment/dashboard.py +140 -0
- databricks/labs/lakebridge/deployment/installation.py +125 -0
- databricks/labs/lakebridge/deployment/job.py +147 -0
- databricks/labs/lakebridge/deployment/recon.py +145 -0
- databricks/labs/lakebridge/deployment/table.py +30 -0
- databricks/labs/lakebridge/deployment/upgrade_common.py +124 -0
- databricks/labs/lakebridge/discovery/table.py +36 -0
- databricks/labs/lakebridge/discovery/table_definition.py +23 -0
- databricks/labs/lakebridge/discovery/tsql_table_definition.py +185 -0
- databricks/labs/lakebridge/errors/exceptions.py +1 -0
- databricks/labs/lakebridge/helpers/__init__.py +0 -0
- databricks/labs/lakebridge/helpers/db_sql.py +24 -0
- databricks/labs/lakebridge/helpers/execution_time.py +20 -0
- databricks/labs/lakebridge/helpers/file_utils.py +64 -0
- databricks/labs/lakebridge/helpers/metastore.py +164 -0
- databricks/labs/lakebridge/helpers/recon_config_utils.py +176 -0
- databricks/labs/lakebridge/helpers/string_utils.py +62 -0
- databricks/labs/lakebridge/helpers/telemetry_utils.py +13 -0
- databricks/labs/lakebridge/helpers/validation.py +101 -0
- databricks/labs/lakebridge/install.py +849 -0
- databricks/labs/lakebridge/intermediate/__init__.py +0 -0
- databricks/labs/lakebridge/intermediate/dag.py +88 -0
- databricks/labs/lakebridge/intermediate/engine_adapter.py +0 -0
- databricks/labs/lakebridge/intermediate/root_tables.py +44 -0
- databricks/labs/lakebridge/jvmproxy.py +56 -0
- databricks/labs/lakebridge/lineage.py +42 -0
- databricks/labs/lakebridge/reconcile/__init__.py +0 -0
- databricks/labs/lakebridge/reconcile/compare.py +414 -0
- databricks/labs/lakebridge/reconcile/connectors/__init__.py +0 -0
- databricks/labs/lakebridge/reconcile/connectors/data_source.py +72 -0
- databricks/labs/lakebridge/reconcile/connectors/databricks.py +87 -0
- databricks/labs/lakebridge/reconcile/connectors/jdbc_reader.py +41 -0
- databricks/labs/lakebridge/reconcile/connectors/oracle.py +108 -0
- databricks/labs/lakebridge/reconcile/connectors/secrets.py +30 -0
- databricks/labs/lakebridge/reconcile/connectors/snowflake.py +173 -0
- databricks/labs/lakebridge/reconcile/connectors/source_adapter.py +30 -0
- databricks/labs/lakebridge/reconcile/connectors/sql_server.py +132 -0
- databricks/labs/lakebridge/reconcile/constants.py +37 -0
- databricks/labs/lakebridge/reconcile/exception.py +42 -0
- databricks/labs/lakebridge/reconcile/execute.py +920 -0
- databricks/labs/lakebridge/reconcile/query_builder/__init__.py +0 -0
- databricks/labs/lakebridge/reconcile/query_builder/aggregate_query.py +293 -0
- databricks/labs/lakebridge/reconcile/query_builder/base.py +138 -0
- databricks/labs/lakebridge/reconcile/query_builder/count_query.py +33 -0
- databricks/labs/lakebridge/reconcile/query_builder/expression_generator.py +292 -0
- databricks/labs/lakebridge/reconcile/query_builder/hash_query.py +91 -0
- databricks/labs/lakebridge/reconcile/query_builder/sampling_query.py +123 -0
- databricks/labs/lakebridge/reconcile/query_builder/threshold_query.py +231 -0
- databricks/labs/lakebridge/reconcile/recon_capture.py +635 -0
- databricks/labs/lakebridge/reconcile/recon_config.py +363 -0
- databricks/labs/lakebridge/reconcile/recon_output_config.py +85 -0
- databricks/labs/lakebridge/reconcile/runner.py +97 -0
- databricks/labs/lakebridge/reconcile/sampler.py +239 -0
- databricks/labs/lakebridge/reconcile/schema_compare.py +126 -0
- databricks/labs/lakebridge/resources/__init__.py +0 -0
- databricks/labs/lakebridge/resources/config/credentials.yml +33 -0
- databricks/labs/lakebridge/resources/reconcile/__init__.py +0 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/__init__.py +0 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/00_0_aggregate_recon_header.md +6 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_0_recon_id.filter.yml +6 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_1_executed_by.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_2_started_at.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_0_source_type.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_1_source_table.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_2_target_table.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/04_0_aggregate_summary_table.sql +46 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/05_0_aggregate_recon_drilldown_header.md +2 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_0_recon_id.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_1_category.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_2_aggregate_type.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/07_0_target_table.filter.yml +4 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/07_1_source_table.filter.yml +4 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/08_0_aggregate_details_table.sql +92 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/09_0_aggregate_missing_mismatch_header.md +1 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/10_0_aggr_mismatched_records.sql +19 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/11_0_aggr_missing_in_databricks.sql +19 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/11_1_aggr_missing_in_source.sql +19 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/dashboard.yml +365 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/00_0_recon_main.md +3 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/01_0_recon_id.filter.yml +6 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/01_1_report_type.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/01_2_executed_by.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/02_0_source_type.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/02_1_source_table.filter.yml +6 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/02_2_target_table.filter.yml +6 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/03_0_started_at.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/05_0_summary_table.sql +38 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/06_0_schema_comparison_header.md +3 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/07_0_schema_details_table.sql +42 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/08_0_drill_down_header.md +3 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/09_0_recon_id.filter.yml +4 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/09_1_category.filter.yml +4 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/10_0_target_table.filter.yml +4 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/10_1_source_table.filter.yml +4 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/11_0_recon_details_pivot.sql +40 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/12_0_daily_data_validation_issue_header.md +3 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/13_0_success_fail_.filter.yml +4 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/14_0_failed_recon_ids.sql +15 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/15_0_total_failed_runs.sql +10 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/15_1_failed_targets.sql +10 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/15_2_successful_targets.sql +10 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/16_0_missing_mismatch_header.md +1 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/17_0_mismatched_records.sql +14 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/17_1_threshold_mismatches.sql +14 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/18_0_missing_in_databricks.sql +14 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/18_1_missing_in_source.sql +14 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/dashboard.yml +545 -0
- databricks/labs/lakebridge/resources/reconcile/queries/__init__.py +0 -0
- databricks/labs/lakebridge/resources/reconcile/queries/installation/__init__.py +0 -0
- databricks/labs/lakebridge/resources/reconcile/queries/installation/aggregate_details.sql +7 -0
- databricks/labs/lakebridge/resources/reconcile/queries/installation/aggregate_metrics.sql +15 -0
- databricks/labs/lakebridge/resources/reconcile/queries/installation/aggregate_rules.sql +6 -0
- databricks/labs/lakebridge/resources/reconcile/queries/installation/details.sql +7 -0
- databricks/labs/lakebridge/resources/reconcile/queries/installation/main.sql +24 -0
- databricks/labs/lakebridge/resources/reconcile/queries/installation/metrics.sql +21 -0
- databricks/labs/lakebridge/transpiler/__init__.py +0 -0
- databricks/labs/lakebridge/transpiler/execute.py +423 -0
- databricks/labs/lakebridge/transpiler/lsp/__init__.py +0 -0
- databricks/labs/lakebridge/transpiler/lsp/lsp_engine.py +564 -0
- databricks/labs/lakebridge/transpiler/sqlglot/__init__.py +0 -0
- databricks/labs/lakebridge/transpiler/sqlglot/dialect_utils.py +30 -0
- databricks/labs/lakebridge/transpiler/sqlglot/generator/__init__.py +0 -0
- databricks/labs/lakebridge/transpiler/sqlglot/generator/databricks.py +771 -0
- databricks/labs/lakebridge/transpiler/sqlglot/lca_utils.py +138 -0
- databricks/labs/lakebridge/transpiler/sqlglot/local_expression.py +197 -0
- databricks/labs/lakebridge/transpiler/sqlglot/parsers/__init__.py +0 -0
- databricks/labs/lakebridge/transpiler/sqlglot/parsers/oracle.py +23 -0
- databricks/labs/lakebridge/transpiler/sqlglot/parsers/presto.py +202 -0
- databricks/labs/lakebridge/transpiler/sqlglot/parsers/snowflake.py +535 -0
- databricks/labs/lakebridge/transpiler/sqlglot/sqlglot_engine.py +203 -0
- databricks/labs/lakebridge/transpiler/transpile_engine.py +49 -0
- databricks/labs/lakebridge/transpiler/transpile_status.py +68 -0
- databricks/labs/lakebridge/uninstall.py +28 -0
- databricks/labs/lakebridge/upgrades/v0.4.0_add_main_table_operation_name_column.py +80 -0
- databricks/labs/lakebridge/upgrades/v0.6.0_alter_metrics_datatype.py +51 -0
- databricks_labs_lakebridge-0.10.0.dist-info/METADATA +58 -0
- databricks_labs_lakebridge-0.10.0.dist-info/RECORD +171 -0
- databricks_labs_lakebridge-0.10.0.dist-info/WHEEL +4 -0
- databricks_labs_lakebridge-0.10.0.dist-info/entry_points.txt +2 -0
- databricks_labs_lakebridge-0.10.0.dist-info/licenses/LICENSE +69 -0
- databricks_labs_lakebridge-0.10.0.dist-info/licenses/NOTICE +42 -0
- docs/lakebridge/src/components/Button.tsx +81 -0
- docs/lakebridge/src/css/custom.css +167 -0
- docs/lakebridge/src/css/table.css +20 -0
- docs/lakebridge/src/pages/index.tsx +57 -0
- docs/lakebridge/src/theme/Footer/index.tsx +24 -0
- docs/lakebridge/src/theme/Layout/index.tsx +18 -0
databricks/labs/lakebridge/reconcile/recon_config.py (new file)
@@ -0,0 +1,363 @@
from __future__ import annotations

import logging

from dataclasses import dataclass
from collections.abc import Callable

from sqlglot import expressions as exp

from databricks.labs.lakebridge.reconcile.constants import SamplingOptionMethod, SamplingSpecificationsType

logger = logging.getLogger(__name__)

_SUPPORTED_AGG_TYPES: set[str] = {
    "min",
    "max",
    "count",
    "sum",
    "avg",
    "mean",
    "mode",
    "stddev",
    "variance",
    "median",
}

RECONCILE_OPERATION_NAME = "reconcile"
AGG_RECONCILE_OPERATION_NAME = "aggregates-reconcile"


class TableThresholdBoundsException(ValueError):
    """Raise the error when the bounds for table threshold are invalid"""


class InvalidModelForTableThreshold(ValueError):
    """Raise the error when the model for table threshold is invalid"""


@dataclass
class HashAlgoMapping:
    source: Callable
    target: Callable


@dataclass
class SamplingSpecifications:
    type: SamplingSpecificationsType
    value: float

    def __post_init__(self):
        if not isinstance(self.type, SamplingSpecificationsType):
            self.type = SamplingSpecificationsType(str(self.type).lower())
        # Disabled
        if self.type == SamplingSpecificationsType.FRACTION:
            raise ValueError("SamplingSpecifications: 'FRACTION' type is disabled")
        if self.type == SamplingSpecificationsType.FRACTION and (self.value is None or (not 0 < self.value < 1)):
            raise ValueError("SamplingSpecifications: Fraction value must be greater than 0 and less than 1")


@dataclass
class SamplingOptions:
    method: SamplingOptionMethod
    specifications: SamplingSpecifications
    stratified_columns: list[str] | None = None
    stratified_buckets: int | None = None

    def __post_init__(self):
        if not isinstance(self.method, SamplingOptionMethod):
            self.method = SamplingOptionMethod(str(self.method).lower())

        if self.stratified_columns:
            self.stratified_columns = [col.lower() for col in self.stratified_columns]

        if self.method == SamplingOptionMethod.STRATIFIED:
            if not self.stratified_columns or not self.stratified_buckets:
                raise ValueError(
                    "SamplingOptions : stratified_columns and stratified_buckets are required for STRATIFIED method"
                )


@dataclass
class JdbcReaderOptions:
    number_partitions: int | None = None
    partition_column: str | None = None
    lower_bound: str | None = None
    upper_bound: str | None = None
    fetch_size: int = 100

    def __post_init__(self):
        self.partition_column = self.partition_column.lower() if self.partition_column else None


@dataclass
class ColumnMapping:
    source_name: str
    target_name: str

    def __post_init__(self):
        self.source_name = self.source_name.lower()
        self.target_name = self.target_name.lower()


@dataclass
class Transformation:
    column_name: str
    source: str | None = None
    target: str | None = None

    def __post_init__(self):
        self.column_name = self.column_name.lower()


@dataclass
class ColumnThresholds:
    column_name: str
    lower_bound: str
    upper_bound: str
    type: str

    def __post_init__(self):
        self.column_name = self.column_name.lower()
        self.type = self.type.lower()

    def get_mode(self):
        return "percentage" if "%" in self.lower_bound or "%" in self.upper_bound else "absolute"

    def get_type(self):
        if any(self.type in numeric_type.value.lower() for numeric_type in exp.DataType.NUMERIC_TYPES):
            if self.get_mode() == "absolute":
                return "number_absolute"
            return "number_percentage"

        if any(self.type in numeric_type.value.lower() for numeric_type in exp.DataType.TEMPORAL_TYPES):
            return "datetime"
        return None


@dataclass
class TableThresholds:
    lower_bound: str
    upper_bound: str
    model: str

    def __post_init__(self):
        self.model = self.model.lower()
        self.validate_threshold_bounds()
        self.validate_threshold_model()

    def get_mode(self):
        return "percentage" if "%" in self.lower_bound or "%" in self.upper_bound else "absolute"

    def validate_threshold_bounds(self):
        lower_bound = int(self.lower_bound.replace("%", ""))
        upper_bound = int(self.upper_bound.replace("%", ""))
        if lower_bound < 0 or upper_bound < 0:
            raise TableThresholdBoundsException("Threshold bounds for table cannot be negative.")
        if lower_bound > upper_bound:
            raise TableThresholdBoundsException("Lower bound cannot be greater than upper bound.")

    def validate_threshold_model(self):
        if self.model not in ["mismatch"]:
            raise InvalidModelForTableThreshold(
                f"Invalid model for Table Threshold: expected 'mismatch', but got '{self.model}'."
            )


@dataclass
class Filters:
    source: str | None = None
    target: str | None = None


def to_lower_case(input_list: list[str]) -> list[str]:
    return [element.lower() for element in input_list]


@dataclass
class Table:
    source_name: str
    target_name: str
    sampling_options: SamplingOptions | None = None
    aggregates: list[Aggregate] | None = None
    join_columns: list[str] | None = None
    jdbc_reader_options: JdbcReaderOptions | None = None
    select_columns: list[str] | None = None
    drop_columns: list[str] | None = None
    column_mapping: list[ColumnMapping] | None = None
    transformations: list[Transformation] | None = None
    column_thresholds: list[ColumnThresholds] | None = None
    filters: Filters | None = None
    table_thresholds: list[TableThresholds] | None = None

    def __post_init__(self):
        self.source_name = self.source_name.lower()
        self.target_name = self.target_name.lower()
        self.select_columns = to_lower_case(self.select_columns) if self.select_columns else None
        self.drop_columns = to_lower_case(self.drop_columns) if self.drop_columns else None
        self.join_columns = to_lower_case(self.join_columns) if self.join_columns else None

    @property
    def to_src_col_map(self):
        if self.column_mapping:
            return {c.source_name: c.target_name for c in self.column_mapping}
        return None

    @property
    def to_tgt_col_map(self):
        if self.column_mapping:
            return {c.target_name: c.source_name for c in self.column_mapping}
        return None

    def get_src_to_tgt_col_mapping_list(self, cols: list[str], layer: str) -> set[str]:
        if layer == "source":
            return set(cols)
        if self.to_src_col_map:
            return {self.to_src_col_map.get(col, col) for col in cols}
        return set(cols)

    def get_layer_src_to_tgt_col_mapping(self, column_name: str, layer: str) -> str:
        if layer == "source":
            return column_name
        if self.to_src_col_map:
            return self.to_src_col_map.get(column_name, column_name)
        return column_name

    def get_tgt_to_src_col_mapping_list(self, cols: list[str] | set[str]) -> set[str]:
        if self.to_tgt_col_map:
            return {self.to_tgt_col_map.get(col, col) for col in cols}
        return set(cols)

    def get_layer_tgt_to_src_col_mapping(self, column_name: str, layer: str) -> str:
        if layer == "source":
            return column_name
        if self.to_tgt_col_map:
            return self.to_tgt_col_map.get(column_name, column_name)
        return column_name

    def get_select_columns(self, schema: list[Schema], layer: str) -> set[str]:
        if self.select_columns is None:
            return {sch.column_name for sch in schema}
        if self.to_src_col_map:
            return self.get_src_to_tgt_col_mapping_list(self.select_columns, layer)
        return set(self.select_columns)

    def get_threshold_columns(self, layer: str) -> set[str]:
        if self.column_thresholds is None:
            return set()
        return {self.get_layer_src_to_tgt_col_mapping(thresh.column_name, layer) for thresh in self.column_thresholds}

    def get_join_columns(self, layer: str) -> set[str] | None:
        if self.join_columns is None:
            return None
        return {self.get_layer_src_to_tgt_col_mapping(col, layer) for col in self.join_columns}

    def get_drop_columns(self, layer: str) -> set[str]:
        if self.drop_columns is None:
            return set()
        return {self.get_layer_src_to_tgt_col_mapping(col, layer) for col in self.drop_columns}

    def get_transformation_dict(self, layer: str) -> dict[str, str]:
        if self.transformations:
            if layer == "source":
                return {
                    trans.column_name: (trans.source if trans.source else trans.column_name)
                    for trans in self.transformations
                }
            return {
                self.get_layer_src_to_tgt_col_mapping(trans.column_name, layer): (
                    trans.target if trans.target else self.get_layer_src_to_tgt_col_mapping(trans.column_name, layer)
                )
                for trans in self.transformations
            }
        return {}

    def get_partition_column(self, layer: str) -> set[str]:
        if self.jdbc_reader_options and layer == "source":
            if self.jdbc_reader_options.partition_column:
                return {self.jdbc_reader_options.partition_column}
        return set()

    def get_filter(self, layer: str) -> str | None:
        if self.filters is None:
            return None
        if layer == "source":
            return self.filters.source
        return self.filters.target


@dataclass
class Schema:
    column_name: str
    data_type: str


@dataclass
class Aggregate:
    agg_columns: list[str]
    type: str
    group_by_columns: list[str] | None = None

    def __post_init__(self):
        self.agg_columns = to_lower_case(self.agg_columns)
        self.type = self.type.lower()
        self.group_by_columns = to_lower_case(self.group_by_columns) if self.group_by_columns else None
        assert (
            self.type in _SUPPORTED_AGG_TYPES
        ), f"Invalid aggregate type: {self.type}, only {_SUPPORTED_AGG_TYPES} are supported."

    def get_agg_type(self):
        return self.type

    @classmethod
    def _join_columns(cls, columns: list[str]):
        return "+__+".join(columns)

    @property
    def group_by_columns_as_str(self):
        return self._join_columns(self.group_by_columns) if self.group_by_columns else "NA"

    @property
    def agg_columns_as_str(self):
        return self._join_columns(self.agg_columns)


@dataclass
class AggregateRule:
    agg_type: str
    agg_column: str
    group_by_columns: list[str] | None
    group_by_columns_as_str: str
    rule_type: str = "AGGREGATE"

    @property
    def column_from_rule(self):
        # Creates the rule column name, e.g., min_col1_grp1_grp2
        return f"{self.agg_type}_{self.agg_column}_{self.group_by_columns_as_str}"

    @property
    def group_by_columns_as_table_column(self):
        # If group_by_columns are not defined, store it as NULL
        group_by_cols_as_table_col = "NULL"
        if self.group_by_columns:
            # Sort the columns, convert to lower case and create a string, e.g., grp1, grp2
            formatted_cols = ", ".join([f"{col.lower()}" for col in sorted(self.group_by_columns)])
            group_by_cols_as_table_col = f"\"{formatted_cols}\""
        return group_by_cols_as_table_col

    def get_rule_query(self, rule_id):
        rule_info = f""" map( 'agg_type', '{self.agg_type}',
                     'agg_column', '{self.agg_column}',
                     'group_by_columns', {self.group_by_columns_as_table_column}
                     )
                 """
        return f" SELECT {rule_id} as rule_id, " f" '{self.rule_type}' as rule_type, " f" {rule_info} as rule_info "


@dataclass
class AggregateQueryRules:
    layer: str
    group_by_columns: list[str] | None
    group_by_columns_as_str: str
    query: str
    rules: list[AggregateRule]
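For orientation, a minimal sketch of how these configuration dataclasses compose into a single table-pair definition. The table and column names below are made up for illustration; the lower-casing and validation are exactly the __post_init__ hooks shown above.

    from databricks.labs.lakebridge.reconcile.recon_config import (
        Aggregate,
        ColumnMapping,
        Filters,
        Table,
    )

    # Hypothetical table pair; all names are normalized to lower case on construction.
    table = Table(
        source_name="ORDERS",
        target_name="orders_bronze",
        join_columns=["ORDER_ID"],
        column_mapping=[ColumnMapping(source_name="ORDER_ID", target_name="order_id")],
        aggregates=[Aggregate(agg_columns=["amount"], type="SUM", group_by_columns=["region"])],
        filters=Filters(source="region = 'US'", target="region = 'US'"),
    )

    assert table.source_name == "orders"                    # lower-cased by __post_init__
    assert table.get_join_columns("source") == {"order_id"}
    assert table.aggregates[0].type == "sum"                # checked against _SUPPORTED_AGG_TYPES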
databricks/labs/lakebridge/reconcile/recon_output_config.py (new file)
@@ -0,0 +1,85 @@
from dataclasses import dataclass, field

from pyspark.sql import DataFrame

from databricks.labs.lakebridge.reconcile.recon_config import AggregateRule


@dataclass
class MismatchOutput:
    mismatch_df: DataFrame | None = None
    mismatch_columns: list[str] | None = None


@dataclass
class ThresholdOutput:
    threshold_df: DataFrame | None = None
    threshold_mismatch_count: int = 0


@dataclass
class DataReconcileOutput:
    mismatch_count: int = 0
    missing_in_src_count: int = 0
    missing_in_tgt_count: int = 0
    mismatch: MismatchOutput = field(default_factory=MismatchOutput)
    missing_in_src: DataFrame | None = None
    missing_in_tgt: DataFrame | None = None
    threshold_output: ThresholdOutput = field(default_factory=ThresholdOutput)
    exception: str | None = None


@dataclass
class SchemaMatchResult:
    source_column: str
    source_datatype: str
    databricks_column: str
    databricks_datatype: str
    is_valid: bool = True


@dataclass
class SchemaReconcileOutput:
    is_valid: bool
    compare_df: DataFrame | None = None
    exception: str | None = None


@dataclass
class ReconcileProcessDuration:
    start_ts: str
    end_ts: str | None


@dataclass
class StatusOutput:
    row: bool | None = None
    column: bool | None = None
    schema: bool | None = None
    aggregate: bool | None = None


@dataclass
class ReconcileTableOutput:
    target_table_name: str
    source_table_name: str
    status: StatusOutput = field(default_factory=StatusOutput)
    exception_message: str | None = None


@dataclass
class ReconcileOutput:
    recon_id: str
    results: list[ReconcileTableOutput]


@dataclass
class ReconcileRecordCount:
    source: int = 0
    target: int = 0


@dataclass
class AggregateQueryOutput:
    rule: AggregateRule | None
    reconcile_output: DataReconcileOutput
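These output containers nest one inside the other, with every default "empty" (zero counts, no DataFrames, no exception) so a run can populate them incrementally. A short sketch with illustrative values:

    from databricks.labs.lakebridge.reconcile.recon_output_config import (
        ReconcileOutput,
        ReconcileTableOutput,
        StatusOutput,
    )

    # Illustrative run result: one table pair whose row comparison failed.
    run_output = ReconcileOutput(
        recon_id="00000000-0000-0000-0000-000000000000",  # hypothetical recon id
        results=[
            ReconcileTableOutput(
                target_table_name="catalog.schema.orders",
                source_table_name="db.orders",
                status=StatusOutput(row=False, column=True, schema=True),
            )
        ],
    )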
databricks/labs/lakebridge/reconcile/runner.py (new file)
@@ -0,0 +1,97 @@
import logging
import webbrowser

from databricks.labs.blueprint.installation import Installation
from databricks.labs.blueprint.installation import SerdeError
from databricks.labs.blueprint.installer import InstallState
from databricks.labs.blueprint.tui import Prompts
from databricks.sdk import WorkspaceClient
from databricks.sdk.errors import NotFound, PermissionDenied

from databricks.labs.lakebridge.config import ReconcileConfig, TableRecon
from databricks.labs.lakebridge.deployment.recon import RECON_JOB_NAME
from databricks.labs.lakebridge.reconcile.recon_config import RECONCILE_OPERATION_NAME

logger = logging.getLogger(__name__)

_RECON_README_URL = "https://github.com/databrickslabs/lakebridge/blob/main/docs/recon_configurations/README.md"


class ReconcileRunner:
    def __init__(
        self,
        ws: WorkspaceClient,
        installation: Installation,
        install_state: InstallState,
        prompts: Prompts,
    ):
        self._ws = ws
        self._installation = installation
        self._install_state = install_state
        self._prompts = prompts

    def run(self, operation_name=RECONCILE_OPERATION_NAME):
        reconcile_config = self._get_verified_recon_config()
        job_id = self._get_recon_job_id(reconcile_config)
        logger.info(f"Triggering the reconcile job with job_id: `{job_id}`")
        wait = self._ws.jobs.run_now(job_id, job_parameters={"operation_name": operation_name})
        if not wait.run_id:
            raise SystemExit(f"Job {job_id} execution failed. Please check the job logs for more details.")

        job_run_url = f"{self._ws.config.host}/jobs/{job_id}/runs/{wait.run_id}"
        logger.info(
            f"'{operation_name.upper()}' job started. Please check the job_url `{job_run_url}` for the current status."
        )
        if self._prompts.confirm(f"Would you like to open the job run URL `{job_run_url}` in the browser?"):
            webbrowser.open(job_run_url)

    def _get_verified_recon_config(self) -> ReconcileConfig:
        try:
            recon_config = self._installation.load(ReconcileConfig)
        except NotFound as err:
            raise SystemExit("Cannot find existing `reconcile` installation. Please try reinstalling.") from err
        except (PermissionDenied, SerdeError, ValueError, AttributeError) as e:
            install_dir = self._installation.install_folder()
            raise SystemExit(
                f"Existing `reconcile` installation at {install_dir} is corrupted. Please try reinstalling."
            ) from e

        self._verify_recon_table_config(recon_config)
        return recon_config

    def _verify_recon_table_config(self, recon_config):
        source_catalog_or_schema = (
            recon_config.database_config.source_catalog
            if recon_config.database_config.source_catalog
            else recon_config.database_config.source_schema
        )
        # Filename pattern for recon table config `recon_config_<SOURCE>_<CATALOG_OR_SCHEMA>_<FILTER_TYPE>.json`
        # Example: recon_config_snowflake_sample_data_all.json
        filename = f"recon_config_{recon_config.data_source}_{source_catalog_or_schema}_{recon_config.report_type}.json"
        try:
            logger.debug(f"Loading recon table config `{filename}` from workspace.")
            self._installation.load(TableRecon, filename=filename)
        except NotFound as e:
            err_msg = (
                "Cannot find recon table configuration in existing `reconcile` installation. "
                f"Please provide the configuration file {filename} in the workspace."
            )
            logger.error(f"{err_msg}. For more details, please refer to {_RECON_README_URL}")
            raise SystemExit(err_msg) from e
        except (PermissionDenied, SerdeError, ValueError, AttributeError) as e:
            install_dir = self._installation.install_folder()
            err_msg = (
                f"Cannot load corrupted recon table configuration from {install_dir}/{filename}. "
                f"Please validate the file."
            )
            logger.error(f"{err_msg}. For more details, please refer to {_RECON_README_URL}")
            raise SystemExit(err_msg) from e

    def _get_recon_job_id(self, reconcile_config: ReconcileConfig) -> int:
        if reconcile_config.job_id:
            logger.debug("Reconcile job id found in the reconcile config.")
            return int(reconcile_config.job_id)
        if RECON_JOB_NAME in self._install_state.jobs:
            logger.debug("Reconcile job id found in the install state.")
            return int(self._install_state.jobs[RECON_JOB_NAME])
        raise SystemExit("Reconcile Job ID not found. Please try reinstalling.")
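A hedged sketch of driving ReconcileRunner outside the bundled CLI, assuming blueprint's Installation.assume_user_home helper and a product folder name of "lakebridge" (the product name is an assumption; the real install folder is chosen at install time):

    from databricks.labs.blueprint.installation import Installation
    from databricks.labs.blueprint.installer import InstallState
    from databricks.labs.blueprint.tui import Prompts
    from databricks.sdk import WorkspaceClient

    from databricks.labs.lakebridge.reconcile.recon_config import RECONCILE_OPERATION_NAME
    from databricks.labs.lakebridge.reconcile.runner import ReconcileRunner

    ws = WorkspaceClient()  # credentials resolved from the environment
    installation = Installation.assume_user_home(ws, "lakebridge")  # assumed product name
    runner = ReconcileRunner(
        ws,
        installation,
        InstallState.from_installation(installation),
        Prompts(),
    )
    runner.run(operation_name=RECONCILE_OPERATION_NAME)  # triggers the deployed reconcile job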