databricks-labs-lakebridge 0.10.6__py3-none-any.whl → 0.10.8__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their public registries. It is provided for informational purposes only.
Files changed (46)
  1. databricks/labs/lakebridge/__about__.py +1 -1
  2. databricks/labs/lakebridge/analyzer/__init__.py +0 -0
  3. databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py +95 -0
  4. databricks/labs/lakebridge/assessments/profiler_validator.py +103 -0
  5. databricks/labs/lakebridge/base_install.py +20 -3
  6. databricks/labs/lakebridge/cli.py +32 -59
  7. databricks/labs/lakebridge/contexts/application.py +7 -0
  8. databricks/labs/lakebridge/deployment/job.py +2 -2
  9. databricks/labs/lakebridge/helpers/file_utils.py +36 -0
  10. databricks/labs/lakebridge/helpers/validation.py +5 -3
  11. databricks/labs/lakebridge/install.py +73 -484
  12. databricks/labs/lakebridge/reconcile/compare.py +70 -33
  13. databricks/labs/lakebridge/reconcile/connectors/data_source.py +24 -1
  14. databricks/labs/lakebridge/reconcile/connectors/databricks.py +12 -1
  15. databricks/labs/lakebridge/reconcile/connectors/dialect_utils.py +126 -0
  16. databricks/labs/lakebridge/reconcile/connectors/models.py +7 -0
  17. databricks/labs/lakebridge/reconcile/connectors/oracle.py +12 -1
  18. databricks/labs/lakebridge/reconcile/connectors/secrets.py +19 -1
  19. databricks/labs/lakebridge/reconcile/connectors/snowflake.py +63 -30
  20. databricks/labs/lakebridge/reconcile/connectors/tsql.py +28 -2
  21. databricks/labs/lakebridge/reconcile/constants.py +4 -3
  22. databricks/labs/lakebridge/reconcile/execute.py +9 -810
  23. databricks/labs/lakebridge/reconcile/normalize_recon_config_service.py +133 -0
  24. databricks/labs/lakebridge/reconcile/query_builder/base.py +53 -18
  25. databricks/labs/lakebridge/reconcile/query_builder/expression_generator.py +8 -2
  26. databricks/labs/lakebridge/reconcile/query_builder/hash_query.py +7 -13
  27. databricks/labs/lakebridge/reconcile/query_builder/sampling_query.py +18 -19
  28. databricks/labs/lakebridge/reconcile/query_builder/threshold_query.py +36 -15
  29. databricks/labs/lakebridge/reconcile/recon_config.py +3 -15
  30. databricks/labs/lakebridge/reconcile/recon_output_config.py +2 -1
  31. databricks/labs/lakebridge/reconcile/reconciliation.py +511 -0
  32. databricks/labs/lakebridge/reconcile/schema_compare.py +26 -19
  33. databricks/labs/lakebridge/reconcile/trigger_recon_aggregate_service.py +78 -0
  34. databricks/labs/lakebridge/reconcile/trigger_recon_service.py +256 -0
  35. databricks/labs/lakebridge/reconcile/utils.py +38 -0
  36. databricks/labs/lakebridge/transpiler/execute.py +34 -28
  37. databricks/labs/lakebridge/transpiler/installers.py +523 -0
  38. databricks/labs/lakebridge/transpiler/lsp/lsp_engine.py +47 -60
  39. databricks/labs/lakebridge/transpiler/sqlglot/dialect_utils.py +2 -0
  40. databricks/labs/lakebridge/transpiler/transpile_engine.py +0 -18
  41. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/METADATA +1 -1
  42. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/RECORD +46 -35
  43. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/WHEEL +0 -0
  44. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/entry_points.txt +0 -0
  45. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/licenses/LICENSE +0 -0
  46. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/licenses/NOTICE +0 -0
databricks/labs/lakebridge/reconcile/trigger_recon_service.py
@@ -0,0 +1,256 @@
+ import logging
+ from datetime import datetime
+ from uuid import uuid4
+
+ from pyspark.errors import PySparkException
+ from pyspark.sql import SparkSession
+
+ from databricks.sdk import WorkspaceClient
+
+ from databricks.labs.lakebridge.config import ReconcileConfig, TableRecon, DatabaseConfig
+ from databricks.labs.lakebridge.reconcile import utils
+ from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource
+ from databricks.labs.lakebridge.reconcile.exception import DataSourceRuntimeException, ReconciliationException
+ from databricks.labs.lakebridge.reconcile.recon_capture import (
+     ReconCapture,
+     ReconIntermediatePersist,
+     generate_final_reconcile_output,
+ )
+ from databricks.labs.lakebridge.reconcile.recon_config import Table, Schema
+ from databricks.labs.lakebridge.reconcile.recon_output_config import (
+     ReconcileOutput,
+     ReconcileProcessDuration,
+     SchemaReconcileOutput,
+     DataReconcileOutput,
+ )
+ from databricks.labs.lakebridge.reconcile.reconciliation import Reconciliation
+ from databricks.labs.lakebridge.reconcile.schema_compare import SchemaCompare
+ from databricks.labs.lakebridge.reconcile.normalize_recon_config_service import NormalizeReconConfigService
+ from databricks.labs.lakebridge.transpiler.execute import verify_workspace_client
+ from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import get_dialect
+
+ logger = logging.getLogger(__name__)
+ _RECON_REPORT_TYPES = {"schema", "data", "row", "all", "aggregate"}
+
+
+ class TriggerReconService:
+
+     @staticmethod
+     def trigger_recon(
+         ws: WorkspaceClient,
+         spark: SparkSession,
+         table_recon: TableRecon,
+         reconcile_config: ReconcileConfig,
+         local_test_run: bool = False,
+     ) -> ReconcileOutput:
+         reconciler, recon_capture = TriggerReconService.create_recon_dependencies(
+             ws, spark, reconcile_config, local_test_run
+         )
+
+         for table_conf in table_recon.tables:
+             TriggerReconService.recon_one(spark, reconciler, recon_capture, reconcile_config, table_conf)
+
+         return TriggerReconService.verify_successful_reconciliation(
+             generate_final_reconcile_output(
+                 recon_id=recon_capture.recon_id,
+                 spark=spark,
+                 metadata_config=reconcile_config.metadata_config,
+                 local_test_run=local_test_run,
+             )
+         )
+
+     @staticmethod
+     def create_recon_dependencies(
+         ws: WorkspaceClient, spark: SparkSession, reconcile_config: ReconcileConfig, local_test_run: bool = False
+     ) -> tuple[Reconciliation, ReconCapture]:
+         ws_client: WorkspaceClient = verify_workspace_client(ws)
+
+         # validate the report type
+         report_type = reconcile_config.report_type.lower()
+         logger.info(f"report_type: {report_type}, data_source: {reconcile_config.data_source} ")
+         utils.validate_input(report_type, _RECON_REPORT_TYPES, "Invalid report type")
+
+         source, target = utils.initialise_data_source(
+             engine=reconcile_config.data_source,
+             spark=spark,
+             ws=ws_client,
+             secret_scope=reconcile_config.secret_scope,
+         )
+
+         recon_id = str(uuid4())
+         # initialise the Reconciliation
+         reconciler = Reconciliation(
+             source,
+             target,
+             reconcile_config.database_config,
+             report_type,
+             SchemaCompare(spark=spark),
+             get_dialect(reconcile_config.data_source),
+             spark,
+             metadata_config=reconcile_config.metadata_config,
+         )
+
+         recon_capture = ReconCapture(
+             database_config=reconcile_config.database_config,
+             recon_id=recon_id,
+             report_type=report_type,
+             source_dialect=get_dialect(reconcile_config.data_source),
+             ws=ws_client,
+             spark=spark,
+             metadata_config=reconcile_config.metadata_config,
+             local_test_run=local_test_run,
+         )
+
+         return reconciler, recon_capture
+
+     @staticmethod
+     def recon_one(
+         spark: SparkSession,
+         reconciler: Reconciliation,
+         recon_capture: ReconCapture,
+         reconcile_config: ReconcileConfig,
+         table_conf: Table,
+     ):
+         normalized_table_conf = NormalizeReconConfigService(
+             reconciler.source, reconciler.target
+         ).normalize_recon_table_config(table_conf)
+
+         schema_reconcile_output, data_reconcile_output, recon_process_duration = TriggerReconService._do_recon_one(
+             reconciler, reconcile_config, normalized_table_conf
+         )
+
+         TriggerReconService.persist_delta_table(
+             spark,
+             reconciler,
+             recon_capture,
+             schema_reconcile_output,
+             data_reconcile_output,
+             reconcile_config,
+             normalized_table_conf,
+             recon_process_duration,
+         )
+
+     @staticmethod
+     def _do_recon_one(reconciler: Reconciliation, reconcile_config: ReconcileConfig, table_conf: Table):
+         recon_process_duration = ReconcileProcessDuration(start_ts=str(datetime.now()), end_ts=None)
+         schema_reconcile_output = SchemaReconcileOutput(is_valid=True)
+         data_reconcile_output = DataReconcileOutput()
+
+         try:
+             src_schema, tgt_schema = TriggerReconService.get_schemas(
+                 reconciler.source, reconciler.target, table_conf, reconcile_config.database_config, True
+             )
+         except DataSourceRuntimeException as e:
+             schema_reconcile_output = SchemaReconcileOutput(is_valid=False, exception=str(e))
+         else:
+             if reconciler.report_type in {"schema", "all"}:
+                 schema_reconcile_output = TriggerReconService._run_reconcile_schema(
+                     reconciler=reconciler,
+                     table_conf=table_conf,
+                     src_schema=src_schema,
+                     tgt_schema=tgt_schema,
+                 )
+                 logger.warning("Schema comparison is completed.")
+
+             if reconciler.report_type in {"data", "row", "all"}:
+                 data_reconcile_output = TriggerReconService._run_reconcile_data(
+                     reconciler=reconciler,
+                     table_conf=table_conf,
+                     src_schema=src_schema,
+                     tgt_schema=tgt_schema,
+                 )
+                 logger.warning(f"Reconciliation for '{reconciler.report_type}' report completed.")
+
+         recon_process_duration.end_ts = str(datetime.now())
+         return schema_reconcile_output, data_reconcile_output, recon_process_duration
+
+     @staticmethod
+     def get_schemas(
+         source: DataSource,
+         target: DataSource,
+         table_conf: Table,
+         database_config: DatabaseConfig,
+         normalize: bool,
+     ) -> tuple[list[Schema], list[Schema]]:
+         src_schema = source.get_schema(
+             catalog=database_config.source_catalog,
+             schema=database_config.source_schema,
+             table=table_conf.source_name,
+             normalize=normalize,
+         )
+
+         tgt_schema = target.get_schema(
+             catalog=database_config.target_catalog,
+             schema=database_config.target_schema,
+             table=table_conf.target_name,
+             normalize=normalize,
+         )
+
+         return src_schema, tgt_schema
+
+     @staticmethod
+     def _run_reconcile_schema(
+         reconciler: Reconciliation,
+         table_conf: Table,
+         src_schema: list[Schema],
+         tgt_schema: list[Schema],
+     ):
+         try:
+             return reconciler.reconcile_schema(table_conf=table_conf, src_schema=src_schema, tgt_schema=tgt_schema)
+         except PySparkException as e:
+             return SchemaReconcileOutput(is_valid=False, exception=str(e))
+
+     @staticmethod
+     def _run_reconcile_data(
+         reconciler: Reconciliation,
+         table_conf: Table,
+         src_schema: list[Schema],
+         tgt_schema: list[Schema],
+     ) -> DataReconcileOutput:
+         try:
+             return reconciler.reconcile_data(table_conf=table_conf, src_schema=src_schema, tgt_schema=tgt_schema)
+         except DataSourceRuntimeException as e:
+             return DataReconcileOutput(exception=str(e))
+
+     @staticmethod
+     def persist_delta_table(
+         spark: SparkSession,
+         reconciler: Reconciliation,
+         recon_capture: ReconCapture,
+         schema_reconcile_output: SchemaReconcileOutput,
+         data_reconcile_output: DataReconcileOutput,
+         reconcile_config: ReconcileConfig,
+         table_conf: Table,
+         recon_process_duration: ReconcileProcessDuration,
+     ):
+         recon_capture.start(
+             data_reconcile_output=data_reconcile_output,
+             schema_reconcile_output=schema_reconcile_output,
+             table_conf=table_conf,
+             recon_process_duration=recon_process_duration,
+             record_count=reconciler.get_record_count(table_conf, reconciler.report_type),
+         )
+         if reconciler.report_type != "schema":
+             ReconIntermediatePersist(
+                 spark=spark, path=utils.generate_volume_path(table_conf, reconcile_config.metadata_config)
+             ).clean_unmatched_df_from_volume()
+
+     @staticmethod
+     def verify_successful_reconciliation(
+         reconcile_output: ReconcileOutput, operation_name: str = "reconcile"
+     ) -> ReconcileOutput:
+         for table_output in reconcile_output.results:
+             if table_output.exception_message or (
+                 table_output.status.column is False
+                 or table_output.status.row is False
+                 or table_output.status.schema is False
+                 or table_output.status.aggregate is False
+             ):
+                 raise ReconciliationException(
+                     f" Reconciliation failed for one or more tables. Please check the recon metrics for more details."
+                     f" **{operation_name}** failed.",
+                     reconcile_output=reconcile_output,
+                 )
+
+         logger.info("Reconciliation completed successfully.")
+         return reconcile_output
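
Note: the new trigger_recon_service.py module extracts the orchestration previously embedded in reconcile/execute.py into a static-method service. Below is a minimal usage sketch, assuming an existing Databricks SparkSession (`spark`) and workspace authentication; the catalog, schema, secret scope, and table names are hypothetical placeholders, and the config constructors are inferred from the attributes referenced in this diff (exact signatures may differ).

# Sketch only: driving the new TriggerReconService directly from a notebook or job.
# `spark` is assumed to exist already; all literal values below are placeholders.
from databricks.sdk import WorkspaceClient
from databricks.labs.lakebridge.config import DatabaseConfig, ReconcileConfig, ReconcileMetadataConfig, TableRecon
from databricks.labs.lakebridge.reconcile.exception import ReconciliationException
from databricks.labs.lakebridge.reconcile.trigger_recon_service import TriggerReconService

ws = WorkspaceClient()

reconcile_config = ReconcileConfig(
    data_source="snowflake",   # source dialect; Databricks is always the target
    report_type="all",         # must be one of {"schema", "data", "row", "all", "aggregate"}
    secret_scope="my_scope",
    database_config=DatabaseConfig(
        source_catalog="SRC_DB",
        source_schema="PUBLIC",
        target_catalog="main",
        target_schema="recon_target",
    ),
    metadata_config=ReconcileMetadataConfig(catalog="main", schema="recon_meta", volume="recon_vol"),
)
table_recon = TableRecon(...)  # per-table configs; each entry is a recon_config.Table

try:
    # Runs schema and/or data reconciliation per table, persists metrics via ReconCapture,
    # and raises if any table fails its schema/row/column/aggregate checks.
    output = TriggerReconService.trigger_recon(ws, spark, table_recon, reconcile_config)
    print(f"Reconciled {len(output.results)} table(s)")
except ReconciliationException as failure:
    print(f"Reconciliation failed: {failure}")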
databricks/labs/lakebridge/reconcile/utils.py
@@ -0,0 +1,38 @@
+ import logging
+
+ from pyspark.sql import SparkSession
+
+ from databricks.sdk import WorkspaceClient
+
+ from databricks.labs.lakebridge.config import ReconcileMetadataConfig
+ from databricks.labs.lakebridge.reconcile.connectors.source_adapter import create_adapter
+ from databricks.labs.lakebridge.reconcile.exception import InvalidInputException
+ from databricks.labs.lakebridge.reconcile.recon_config import Table
+ from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import get_dialect
+
+ logger = logging.getLogger(__name__)
+
+
+ def initialise_data_source(
+     ws: WorkspaceClient,
+     spark: SparkSession,
+     engine: str,
+     secret_scope: str,
+ ):
+     source = create_adapter(engine=get_dialect(engine), spark=spark, ws=ws, secret_scope=secret_scope)
+     target = create_adapter(engine=get_dialect("databricks"), spark=spark, ws=ws, secret_scope=secret_scope)
+
+     return source, target
+
+
+ def validate_input(input_value: str, list_of_value: set, message: str):
+     if input_value not in list_of_value:
+         error_message = f"{message} --> {input_value} is not one of {list_of_value}"
+         logger.error(error_message)
+         raise InvalidInputException(error_message)
+
+
+ def generate_volume_path(table_conf: Table, metadata_config: ReconcileMetadataConfig):
+     catalog = metadata_config.catalog
+     schema = metadata_config.schema
+     return f"/Volumes/{catalog}/{schema}/{metadata_config.volume}/{table_conf.source_name}_{table_conf.target_name}/"
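
Note: these helpers were factored out of reconcile/execute.py into a shared module used by the new trigger services. A small illustration follows; the catalog/schema/volume and table names are hypothetical, the ReconcileMetadataConfig constructor is inferred from the fields used above, and it is assumed that a Table can be built from just source_name and target_name.

# Illustration only: all literal values below are hypothetical.
from databricks.labs.lakebridge.config import ReconcileMetadataConfig
from databricks.labs.lakebridge.reconcile import utils
from databricks.labs.lakebridge.reconcile.exception import InvalidInputException
from databricks.labs.lakebridge.reconcile.recon_config import Table

# generate_volume_path builds the Unity Catalog volume path used for intermediate
# (unmatched) rows, e.g. "/Volumes/main/recon_meta/recon_vol/orders_src_orders_tgt/".
meta = ReconcileMetadataConfig(catalog="main", schema="recon_meta", volume="recon_vol")
table = Table(source_name="orders_src", target_name="orders_tgt")  # other Table fields assumed optional
print(utils.generate_volume_path(table, meta))

# validate_input raises InvalidInputException when the value is not in the allowed set.
try:
    utils.validate_input("rows", {"schema", "data", "row", "all", "aggregate"}, "Invalid report type")
except InvalidInputException as exc:
    print(exc)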
databricks/labs/lakebridge/transpiler/execute.py
@@ -48,6 +48,26 @@ class TranspilingContext:
      transpiled_code: str | None = None


+ def _validate_transpiled_sql(context: TranspilingContext, content: str, error_list: list[TranspileError]) -> str:
+     if context.validator is None:
+         return content
+     validation_result = _validation(context.validator, context.config, str(content))
+     # Potentially expensive, only evaluate if debug is enabled
+     if logger.isEnabledFor(logging.DEBUG):
+         msg = f"Finished validating transpiled code for file: {context.input_path} (result: {validation_result})"
+         logger.debug(msg)
+     if validation_result.exception_msg is not None:
+         error = TranspileError(
+             "VALIDATION_ERROR",
+             ErrorKind.VALIDATION,
+             ErrorSeverity.WARNING,
+             context.input_path,
+             validation_result.exception_msg,
+         )
+         error_list.append(error)
+     return validation_result.validated_sql
+
+
  async def _process_one_file(context: TranspilingContext) -> tuple[int, list[TranspileError]]:
      input_path = context.input_path

@@ -89,29 +109,29 @@ async def _process_one_file(context: TranspilingContext) -> tuple[int, list[TranspileError]]:
      assert output_path is not None, "Output path must be set in the context"
      output_path.parent.mkdir(exist_ok=True)

-     if _is_combined_result(transpile_result):
-         _process_combined_result(context, error_list)
+     if _is_mime_result(transpile_result):
+         _process_mime_result(context, error_list)
      else:
-         _process_single_result(context, error_list)
+         _process_non_mime_result(context, error_list)

      return transpile_result.success_count, error_list


- def _is_combined_result(result: TranspileResult):
+ def _is_mime_result(result: TranspileResult):
      return result.transpiled_code.startswith("Content-Type: multipart/mixed; boundary=")


- def _process_combined_result(context: TranspilingContext, _error_list: list[TranspileError]) -> None:
+ def _process_mime_result(context: TranspilingContext, error_list: list[TranspileError]) -> None:
      # TODO error handling
      # Added policy to process quoted-printable encoded
      parser = EmailParser(policy=policy.default)
      transpiled_code: str = cast(str, context.transpiled_code)
      message: Message = parser.parsestr(transpiled_code)
      for part in message.walk():
-         _process_combined_part(context, part)
+         _process_combined_part(context, part, error_list)


- def _process_combined_part(context: TranspilingContext, part: Message) -> None:
+ def _process_combined_part(context: TranspilingContext, part: Message, error_list: list[TranspileError]) -> None:
      if part.get_content_type() != "text/plain":
          return  # TODO Need to handle other content types, e.g., text/binary, application/json, etc.
      filename = part.get_filename()
@@ -133,35 +153,21 @@ def _process_combined_part(context: TranspilingContext, part: Message) -> None:
      folder.mkdir(parents=True, exist_ok=True)
      output = folder / segments[-1]
      logger.debug(f"Writing output to: {output}")
+     # Only validate if output file has .sql suffix
+     if output.suffix == ".sql":
+         content = _validate_transpiled_sql(context, content, error_list)
      output.write_text(content)


- def _process_single_result(context: TranspilingContext, error_list: list[TranspileError]) -> None:
+ def _process_non_mime_result(context: TranspilingContext, error_list: list[TranspileError]) -> None:

      output_code: str = context.transpiled_code or ""
+     output_path = cast(Path, context.output_path)

      if any(err.kind == ErrorKind.PARSING for err in error_list):
          output_code = context.source_code or ""
-
-     elif context.validator:
-         logger.debug(f"Validating transpiled code for file: {context.input_path}")
-         validation_result = _validation(context.validator, context.config, str(context.transpiled_code))
-         # Potentially expensive, only evaluate if debug is enabled
-         if logger.isEnabledFor(logging.DEBUG):
-             msg = f"Finished validating transpiled code for file: {context.input_path} (result: {validation_result})"
-             logger.debug(msg)
-         if validation_result.exception_msg is not None:
-             error = TranspileError(
-                 "VALIDATION_ERROR",
-                 ErrorKind.VALIDATION,
-                 ErrorSeverity.WARNING,
-                 context.input_path,
-                 validation_result.exception_msg,
-             )
-             error_list.append(error)
-         output_code = validation_result.validated_sql
-
-     output_path = cast(Path, context.output_path)
+     elif output_path.suffix == ".sql":
+         output_code = _validate_transpiled_sql(context, output_code, error_list)
      with output_path.open("w") as w:
          # The above adds a java-style comment block at the top of the output file
          # This would break .py or .json outputs so we disable it for now.
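
Note: taken together, these hunks consolidate the previously duplicated validation logic into _validate_transpiled_sql and gate it on the output file's .sql suffix, so non-SQL artifacts from a transpile run are written without passing through the SQL validator. A trivial sketch of the suffix gate, using pathlib with hypothetical file names:

from pathlib import Path

# Only .sql outputs are routed through _validate_transpiled_sql; anything else
# (e.g. a notebook transpiled to .py, or a JSON sidecar) is written as-is.
for name in ("query_1.sql", "notebook_1.py", "manifest.json"):
    output = Path("out") / name
    print(f"{output}: validate={output.suffix == '.sql'}")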