databricks-labs-lakebridge 0.10.6__py3-none-any.whl → 0.10.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- databricks/labs/lakebridge/__about__.py +1 -1
- databricks/labs/lakebridge/analyzer/__init__.py +0 -0
- databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py +95 -0
- databricks/labs/lakebridge/assessments/profiler_validator.py +103 -0
- databricks/labs/lakebridge/base_install.py +20 -3
- databricks/labs/lakebridge/cli.py +32 -59
- databricks/labs/lakebridge/contexts/application.py +7 -0
- databricks/labs/lakebridge/deployment/job.py +2 -2
- databricks/labs/lakebridge/helpers/file_utils.py +36 -0
- databricks/labs/lakebridge/helpers/validation.py +5 -3
- databricks/labs/lakebridge/install.py +73 -484
- databricks/labs/lakebridge/reconcile/compare.py +70 -33
- databricks/labs/lakebridge/reconcile/connectors/data_source.py +24 -1
- databricks/labs/lakebridge/reconcile/connectors/databricks.py +12 -1
- databricks/labs/lakebridge/reconcile/connectors/dialect_utils.py +126 -0
- databricks/labs/lakebridge/reconcile/connectors/models.py +7 -0
- databricks/labs/lakebridge/reconcile/connectors/oracle.py +12 -1
- databricks/labs/lakebridge/reconcile/connectors/secrets.py +19 -1
- databricks/labs/lakebridge/reconcile/connectors/snowflake.py +63 -30
- databricks/labs/lakebridge/reconcile/connectors/tsql.py +28 -2
- databricks/labs/lakebridge/reconcile/constants.py +4 -3
- databricks/labs/lakebridge/reconcile/execute.py +9 -810
- databricks/labs/lakebridge/reconcile/normalize_recon_config_service.py +133 -0
- databricks/labs/lakebridge/reconcile/query_builder/base.py +53 -18
- databricks/labs/lakebridge/reconcile/query_builder/expression_generator.py +8 -2
- databricks/labs/lakebridge/reconcile/query_builder/hash_query.py +7 -13
- databricks/labs/lakebridge/reconcile/query_builder/sampling_query.py +18 -19
- databricks/labs/lakebridge/reconcile/query_builder/threshold_query.py +36 -15
- databricks/labs/lakebridge/reconcile/recon_config.py +3 -15
- databricks/labs/lakebridge/reconcile/recon_output_config.py +2 -1
- databricks/labs/lakebridge/reconcile/reconciliation.py +511 -0
- databricks/labs/lakebridge/reconcile/schema_compare.py +26 -19
- databricks/labs/lakebridge/reconcile/trigger_recon_aggregate_service.py +78 -0
- databricks/labs/lakebridge/reconcile/trigger_recon_service.py +256 -0
- databricks/labs/lakebridge/reconcile/utils.py +38 -0
- databricks/labs/lakebridge/transpiler/execute.py +34 -28
- databricks/labs/lakebridge/transpiler/installers.py +523 -0
- databricks/labs/lakebridge/transpiler/lsp/lsp_engine.py +47 -60
- databricks/labs/lakebridge/transpiler/sqlglot/dialect_utils.py +2 -0
- databricks/labs/lakebridge/transpiler/transpile_engine.py +0 -18
- {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/METADATA +1 -1
- {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/RECORD +46 -35
- {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/WHEEL +0 -0
- {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/entry_points.txt +0 -0
- {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/licenses/LICENSE +0 -0
- {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/licenses/NOTICE +0 -0
databricks/labs/lakebridge/__about__.py
@@ -1,2 +1,2 @@
 # DO NOT MODIFY THIS FILE
-__version__ = "0.10.6"
+__version__ = "0.10.8"

databricks/labs/lakebridge/analyzer/__init__.py
(File without changes)
databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py
@@ -0,0 +1,95 @@
+import tempfile
+from pathlib import Path
+
+from databricks.sdk.service.iam import User
+from databricks.sdk.core import with_user_agent_extra
+
+from databricks.labs.blueprint.entrypoint import get_logger
+from databricks.labs.blueprint.tui import Prompts
+
+from databricks.labs.bladespector.analyzer import Analyzer, _PLATFORM_TO_SOURCE_TECHNOLOGY
+
+from databricks.labs.lakebridge.helpers.telemetry_utils import make_alphanum_or_semver
+from databricks.labs.lakebridge.helpers.file_utils import check_path, move_tmp_file
+
+logger = get_logger(__file__)
+
+
+class LakebridgeAnalyzer(Analyzer):
+    def __init__(self, current_user: User, prompts: Prompts, is_debug: bool = False):
+        self._current_user = current_user
+        self._prompts = prompts
+        self._is_debug = is_debug
+        super().__init__()
+
+    def _get_source_directory(self) -> Path:
+        """Get and validate the source directory from user input."""
+        directory_str = self._prompts.question(
+            "Enter full path to the source directory",
+            default=Path.cwd().as_posix(),
+            validate=check_path,
+        )
+        return Path(directory_str).resolve()
+
+    def _get_result_file_path(self, directory: Path) -> Path:
+        """Get the result file path - accepts either filename or full path."""
+        filename = self._prompts.question(
+            "Enter report file name or custom export path including file name without extension",
+            default=f"{directory.as_posix()}/lakebridge-analyzer-results.xlsx",
+            validate=check_path,
+        )
+        return directory / Path(filename) if len(filename.split("/")) == 1 else Path(filename)
+
+    def _get_source_tech(self, platform: str | None = None) -> str:
+        """Validate source technology or prompt for a valid source"""
+        if platform is None or platform not in self.supported_source_technologies():
+            if platform is not None:
+                logger.warning(f"Invalid source technology {platform}")
+            platform = self._prompts.choice("Select the source technology", self.supported_source_technologies())
+        with_user_agent_extra("analyzer_source_tech", make_alphanum_or_semver(platform))
+        logger.debug(f"User: {self._current_user}")
+        return _PLATFORM_TO_SOURCE_TECHNOLOGY[platform]
+
+    @staticmethod
+    def _temp_xlsx_path(results_dir: Path | str) -> Path:
+        return (Path(tempfile.mkdtemp()) / Path(results_dir).name).with_suffix(".xlsx")
+
+    def _run_prompt_analyzer(self):
+        """Run the analyzer: prompt guided"""
+        source_dir = self._get_source_directory()
+        results_dir = self._get_result_file_path(source_dir)
+        tmp_dir = self._temp_xlsx_path(results_dir)
+        technology = self._get_source_tech()
+
+        self._run_binary(source_dir, tmp_dir, technology, self._is_debug)
+
+        move_tmp_file(tmp_dir, results_dir)
+
+        logger.info(f"Successfully Analyzed files in ${source_dir} for ${technology} and saved report to {results_dir}")
+
+    def _run_arg_analyzer(self, source_dir: str | None, results_dir: str | None, technology: str | None):
+        """Run the analyzer: arg guided"""
+        if source_dir is None or results_dir is None or technology is None:
+            logger.error("All arguments (--source-directory, --report-file, --source-tech) must be provided")
+            return
+
+        if check_path(source_dir) and check_path(results_dir):
+            tmp_dir = self._temp_xlsx_path(results_dir)
+            technology = self._get_source_tech(technology)
+            self._run_binary(Path(source_dir), tmp_dir, technology, self._is_debug)
+
+            move_tmp_file(tmp_dir, Path(results_dir))
+
+            logger.info(
+                f"Successfully Analyzed files in ${source_dir} for ${technology} and saved report to {results_dir}"
+            )
+
+    def run_analyzer(
+        self, source_dir: str | None = None, results_dir: str | None = None, technology: str | None = None
+    ):
+        """Run the analyzer."""
+        if not any([source_dir, results_dir, technology]):
+            self._run_prompt_analyzer()
+            return
+
+        self._run_arg_analyzer(source_dir, results_dir, technology)
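The analyzer is now a class with both a prompt-guided and an argument-guided entry point. A minimal sketch of the argument-guided path, assuming a configured Databricks CLI profile; the profile name, paths, and source technology below are placeholders, not values shipped with the package:

    # Hypothetical usage of the new LakebridgeAnalyzer; all literal values are placeholders.
    from databricks.sdk import WorkspaceClient
    from databricks.labs.blueprint.tui import Prompts

    from databricks.labs.lakebridge.analyzer.lakebridge_analyzer import LakebridgeAnalyzer

    ws = WorkspaceClient(profile="DEFAULT")  # placeholder profile
    analyzer = LakebridgeAnalyzer(ws.current_user.me(), Prompts(), is_debug=False)

    # All three arguments must be provided, otherwise _run_arg_analyzer logs an error
    # and returns; passing no arguments at all triggers the prompt-guided flow instead.
    analyzer.run_analyzer(
        source_dir="/path/to/sql-sources",                          # placeholder input folder
        results_dir="/path/to/lakebridge-analyzer-results.xlsx",    # placeholder report path
        technology="Synapse",                                       # placeholder; must be a supported source technology
    )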
databricks/labs/lakebridge/assessments/profiler_validator.py
@@ -0,0 +1,103 @@
+import os
+from dataclasses import dataclass
+from duckdb import DuckDBPyConnection
+
+from databricks.labs.lakebridge.assessments.pipeline import PipelineClass
+
+PROFILER_DB_NAME = "profiler_extract.db"
+
+
+@dataclass(frozen=True)
+class ValidationOutcome:
+    """A data class that holds the outcome of a table validation check."""
+
+    table: str
+    column: str | None
+    strategy: str
+    outcome: str
+    severity: str
+
+
+class ValidationStrategy:
+    """Abstract class for validating a Profiler table"""
+
+    def validate(self, connection: DuckDBPyConnection) -> ValidationOutcome:
+        raise NotImplementedError
+
+
+class NullValidationCheck(ValidationStrategy):
+    """Concrete class for validating null values in a profiler table"""
+
+    def __init__(self, table, column, severity="WARN"):
+        self.name = self.__class__.__name__
+        self.table = table
+        self.column = column
+        self.severity = severity
+
+    def validate(self, connection: DuckDBPyConnection) -> ValidationOutcome:
+        """
+        Validates that a column does not contain null values.
+        input:
+            connection: a DuckDB connection object
+        """
+        result = connection.execute(f"SELECT COUNT(*) FROM {self.table} WHERE {self.column} IS NULL").fetchone()
+        if result:
+            row_count = result[0]
+            outcome = "FAIL" if row_count > 0 else "PASS"
+        else:
+            outcome = "FAIL"
+        return ValidationOutcome(self.table, self.column, self.name, outcome, self.severity)
+
+
+class EmptyTableValidationCheck(ValidationStrategy):
+    """Concrete class for validating empty tables from a profiler run."""
+
+    def __init__(self, table, severity="WARN"):
+        self.name = self.__class__.__name__
+        self.table = table
+        self.severity = severity
+
+    def validate(self, connection) -> ValidationOutcome:
+        """Validates that a table is not empty.
+        input:
+            connection: a DuckDB connection object
+        returns:
+            a ValidationOutcome object
+        """
+        result = connection.execute(f"SELECT COUNT(*) FROM {self.table}").fetchone()
+        if result:
+            row_count = result[0]
+            outcome = "PASS" if row_count > 0 else "FAIL"
+        else:
+            outcome = "FAIL"
+        return ValidationOutcome(self.table, None, self.name, outcome, self.severity)
+
+
+def get_profiler_extract_path(pipeline_config_path: str) -> str:
+    """
+    Returns the filesystem path of the profiler extract database.
+    input:
+        pipeline_config_path: the location of the pipeline definition .yml file
+    returns:
+        the filesystem path to the profiler extract database
+    """
+    pipeline_config = PipelineClass.load_config_from_yaml(pipeline_config_path)
+    normalized_db_path = os.path.normpath(pipeline_config.extract_folder)
+    database_path = f"{normalized_db_path}/{PROFILER_DB_NAME}"
+    return database_path
+
+
+def build_validation_report(
+    validations: list[ValidationStrategy], connection: DuckDBPyConnection
+) -> list[ValidationOutcome]:
+    """
+    Builds a list of ValidationOutcomes from list of validation checks.
+    input:
+        validations: a list of ValidationStrategy objects
+        connection: a DuckDB connection object
+    returns: a list of ValidationOutcomes
+    """
+    validation_report = []
+    for validation in validations:
+        validation_report.append(validation.validate(connection))
+    return validation_report
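A minimal sketch of how these checks might be combined into a report, assuming duckdb is installed locally; the pipeline config path and the table and column names are placeholders rather than names guaranteed to exist in a profiler extract:

    # Hypothetical wiring of the new profiler validation helpers.
    import duckdb

    from databricks.labs.lakebridge.assessments.profiler_validator import (
        EmptyTableValidationCheck,
        NullValidationCheck,
        build_validation_report,
        get_profiler_extract_path,
    )

    db_path = get_profiler_extract_path("pipeline_config.yml")  # placeholder config path
    checks = [
        EmptyTableValidationCheck("databases"),                          # placeholder table
        NullValidationCheck("tables", "table_name", severity="ERROR"),   # placeholder table/column
    ]

    connection = duckdb.connect(db_path, read_only=True)
    try:
        report = build_validation_report(checks, connection)
    finally:
        connection.close()

    for outcome in report:
        if outcome.outcome == "FAIL":
            print(f"{outcome.severity}: {outcome.strategy} failed for {outcome.table} ({outcome.column})")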
databricks/labs/lakebridge/base_install.py
@@ -1,13 +1,30 @@
 from databricks.labs.blueprint.logger import install_logger
 from databricks.labs.blueprint.entrypoint import get_logger
+from databricks.sdk import WorkspaceClient
 from databricks.sdk.core import with_user_agent_extra
 
-install_logger()
-with_user_agent_extra("cmd", "install")
+from databricks.labs.lakebridge import __version__
+from databricks.labs.lakebridge.install import installer as _installer
+from databricks.labs.lakebridge.transpiler.repository import TranspilerRepository
+
+
+def main() -> None:
+    install_logger()
+    with_user_agent_extra("cmd", "install")
 
-if __name__ == "__main__":
     logger = get_logger(__file__)
     logger.setLevel("INFO")
 
+    installer = _installer(
+        WorkspaceClient(product="lakebridge", product_version=__version__),
+        transpiler_repository=TranspilerRepository.user_home(),
+    )
+    if not installer.upgrade_installed_transpilers():
+        logger.debug("No existing Lakebridge transpilers detected; assuming fresh installation.")
+
     logger.info("Successfully Setup Lakebridge Components Locally")
     logger.info("For more information, please visit https://databrickslabs.github.io/lakebridge/")
+
+
+if __name__ == "__main__":
+    main()
databricks/labs/lakebridge/cli.py
@@ -19,20 +19,17 @@ from databricks.labs.blueprint.entrypoint import get_logger, is_in_debug
 from databricks.labs.blueprint.installation import RootJsonValue
 from databricks.labs.blueprint.tui import Prompts
 
-from databricks.labs.bladespector.analyzer import Analyzer
-
 
 from databricks.labs.lakebridge.assessments.configure_assessment import (
     create_assessment_configurator,
     PROFILER_SOURCE_SYSTEM,
 )
 
-from databricks.labs.lakebridge.__about__ import __version__
 from databricks.labs.lakebridge.config import TranspileConfig
 from databricks.labs.lakebridge.contexts.application import ApplicationContext
 from databricks.labs.lakebridge.helpers.recon_config_utils import ReconConfigPrompts
 from databricks.labs.lakebridge.helpers.telemetry_utils import make_alphanum_or_semver
-from databricks.labs.lakebridge.install import
+from databricks.labs.lakebridge.install import installer
 from databricks.labs.lakebridge.reconcile.runner import ReconcileRunner
 from databricks.labs.lakebridge.lineage import lineage_generator
 from databricks.labs.lakebridge.reconcile.recon_config import RECONCILE_OPERATION_NAME, AGG_RECONCILE_OPERATION_NAME
@@ -52,20 +49,6 @@ def raise_validation_exception(msg: str) -> NoReturn:
     raise ValueError(msg)
 
 
-def _installer(ws: WorkspaceClient, transpiler_repository: TranspilerRepository) -> WorkspaceInstaller:
-    app_context = ApplicationContext(_verify_workspace_client(ws))
-    return WorkspaceInstaller(
-        app_context.workspace_client,
-        app_context.prompts,
-        app_context.installation,
-        app_context.install_state,
-        app_context.product_info,
-        app_context.resource_configurator,
-        app_context.workspace_installation,
-        transpiler_repository=transpiler_repository,
-    )
-
-
 def _create_warehouse(ws: WorkspaceClient) -> str:
 
     dbsql = ws.warehouses.create_and_wait(
@@ -89,21 +72,9 @@ def _remove_warehouse(ws: WorkspaceClient, warehouse_id: str):
     logger.info(f"Removed warehouse post installation with id: {warehouse_id}")
 
 
-def _verify_workspace_client(ws: WorkspaceClient) -> WorkspaceClient:
-    """
-    [Private] Verifies and updates the workspace client configuration.
-    """
-
-    # Using reflection to set right value for _product_info for telemetry
-    product_info = getattr(ws.config, '_product_info')
-    if product_info[0] != "lakebridge":
-        setattr(ws.config, '_product_info', ('lakebridge', __version__))
-
-    return ws
-
-
 @lakebridge.command
 def transpile(
+    *,
     w: WorkspaceClient,
     transpiler_config_path: str | None = None,
     source_dialect: str | None = None,
@@ -358,7 +329,7 @@ class _TranspileConfigChecker:
         transpiler_config_path = self._transpiler_repository.transpiler_config_path(transpiler_name)
         logger.info(f"Lakebridge will use the {transpiler_name} transpiler.")
         self._config = dataclasses.replace(self._config, transpiler_config_path=str(transpiler_config_path))
-        return
+        return LSPEngine.from_config_path(transpiler_config_path)
 
     def _configure_source_dialect(
         self, source_dialect: str, engine: TranspileEngine | None, msg_prefix: str
@@ -370,6 +341,8 @@ class _TranspileConfigChecker:
                 supported_dialects = ", ".join(self._transpiler_repository.all_dialects())
                 msg = f"{msg_prefix}: {source_dialect!r} (supported dialects: {supported_dialects})"
                 raise_validation_exception(msg)
+            else:
+                self._config = dataclasses.replace(self._config, source_dialect=source_dialect)
         else:
             # Check the source dialect against the engine.
             if source_dialect not in engine.supported_dialects:
@@ -396,6 +369,7 @@ class _TranspileConfigChecker:
         source_dialect = self._prompts.choice("Select the source dialect:", list(supported_dialects))
         engine = self._configure_transpiler_config_path(source_dialect)
         assert engine is not None, "No transpiler engine available for a supported dialect; configuration is invalid."
+        self._config = dataclasses.replace(self._config, source_dialect=source_dialect)
        return engine
 
     def _check_lsp_engine(self) -> TranspileEngine:
@@ -426,14 +400,15 @@ class _TranspileConfigChecker:
         #
 
         # Step 1: Check the transpiler config path.
+        engine: TranspileEngine | None
         transpiler_config_path = self._config.transpiler_config_path
         if transpiler_config_path is not None:
             self._validate_transpiler_config_path(
                 transpiler_config_path,
-                f"Invalid transpiler
+                f"Error: Invalid value for '--transpiler-config-path': '{str(transpiler_config_path)}', file does not exist.",
             )
             path = Path(transpiler_config_path)
-            engine =
+            engine = LSPEngine.from_config_path(path)
         else:
             engine = None
         del transpiler_config_path
@@ -547,7 +522,7 @@ def _override_workspace_client_config(ctx: ApplicationContext, overrides: dict[s
 
 
 @lakebridge.command
-def reconcile(w: WorkspaceClient) -> None:
+def reconcile(*, w: WorkspaceClient) -> None:
     """[EXPERIMENTAL] Reconciles source to Databricks datasets"""
     with_user_agent_extra("cmd", "execute-reconcile")
     ctx = ApplicationContext(w)
@@ -563,7 +538,7 @@ def reconcile(w: WorkspaceClient) -> None:
 
 
 @lakebridge.command
-def aggregates_reconcile(w: WorkspaceClient) -> None:
+def aggregates_reconcile(*, w: WorkspaceClient) -> None:
     """[EXPERIMENTAL] Reconciles Aggregated source to Databricks datasets"""
     with_user_agent_extra("cmd", "execute-aggregates-reconcile")
     ctx = ApplicationContext(w)
@@ -581,8 +556,8 @@ def aggregates_reconcile(w: WorkspaceClient) -> None:
 
 @lakebridge.command
 def generate_lineage(
-    w: WorkspaceClient,
     *,
+    w: WorkspaceClient,
     source_dialect: str | None = None,
     input_source: str,
     output_folder: str,
@@ -607,7 +582,7 @@ def generate_lineage(
 
 
 @lakebridge.command
-def configure_secrets(w: WorkspaceClient) -> None:
+def configure_secrets(*, w: WorkspaceClient) -> None:
     """Setup reconciliation connection profile details as Secrets on Databricks Workspace"""
     recon_conf = ReconConfigPrompts(w)
 
@@ -633,24 +608,26 @@ def configure_database_profiler() -> None:
     assessment.run()
 
 
-@lakebridge.command
+@lakebridge.command
 def install_transpile(
+    *,
     w: WorkspaceClient,
     artifact: str | None = None,
     transpiler_repository: TranspilerRepository = TranspilerRepository.user_home(),
 ) -> None:
-    """Install the Lakebridge transpilers"""
+    """Install or upgrade the Lakebridge transpilers."""
     with_user_agent_extra("cmd", "install-transpile")
     if artifact:
         with_user_agent_extra("artifact-overload", Path(artifact).name)
     user = w.current_user
     logger.debug(f"User: {user}")
-
-
+    transpile_installer = installer(w, transpiler_repository)
+    transpile_installer.run(module="transpile", artifact=artifact)
 
 
 @lakebridge.command(is_unauthenticated=False)
 def configure_reconcile(
+    *,
     w: WorkspaceClient,
     transpiler_repository: TranspilerRepository = TranspilerRepository.user_home(),
 ) -> None:
@@ -662,28 +639,24 @@ def configure_reconcile(
     dbsql_id = _create_warehouse(w)
     w.config.warehouse_id = dbsql_id
     logger.debug(f"Warehouse ID used for configuring reconcile: {w.config.warehouse_id}.")
-
-
+    reconcile_installer = installer(w, transpiler_repository)
+    reconcile_installer.run(module="reconcile")
 
 
-@lakebridge.command
-def analyze(
+@lakebridge.command
+def analyze(
+    *,
+    w: WorkspaceClient,
+    source_directory: str | None = None,
+    report_file: str | None = None,
+    source_tech: str | None = None,
+):
     """Run the Analyzer"""
     with_user_agent_extra("cmd", "analyze")
     ctx = ApplicationContext(w)
-
-
-
-    if source_tech is None:
-        source_tech = prompts.choice("Select the source technology", Analyzer.supported_source_technologies())
-        with_user_agent_extra("analyzer_source_tech", make_alphanum_or_semver(source_tech))
-    user = ctx.current_user
-    logger.debug(f"User: {user}")
-    is_debug = logger.getEffectiveLevel() == logging.DEBUG
-    Analyzer.analyze(Path(input_folder), Path(output_file), source_tech, is_debug=is_debug)
-    logger.info(
-        f"Successfully Analyzed files in ${source_directory} for ${source_tech} and saved report to {report_file}"
-    )
+
+    logger.debug(f"User: {ctx.current_user}")
+    ctx.analyzer.run_analyzer(source_directory, report_file, source_tech)
 
 
 if __name__ == "__main__":
databricks/labs/lakebridge/contexts/application.py
@@ -12,6 +12,7 @@ from databricks.sdk.config import Config
 from databricks.sdk.errors import NotFound
 from databricks.sdk.service.iam import User
 
+from databricks.labs.lakebridge.analyzer.lakebridge_analyzer import LakebridgeAnalyzer
 from databricks.labs.lakebridge.config import TranspileConfig, ReconcileConfig, LakebridgeConfiguration
 from databricks.labs.lakebridge.deployment.configurator import ResourceConfigurator
 from databricks.labs.lakebridge.deployment.dashboard import DashboardDeployment
@@ -22,6 +23,7 @@ from databricks.labs.lakebridge.helpers.metastore import CatalogOperations
 logger = logging.getLogger(__name__)
 
 
+# pylint: disable=too-many-public-methods
 class ApplicationContext:
     def __init__(self, ws: WorkspaceClient):
         self._ws = ws
@@ -131,3 +133,8 @@ class ApplicationContext:
     @cached_property
     def upgrades(self):
         return Upgrades(self.product_info, self.installation)
+
+    @cached_property
+    def analyzer(self):
+        is_debug = logger.getEffectiveLevel() == logging.DEBUG
+        return LakebridgeAnalyzer(self.current_user, self.prompts, is_debug)
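A short sketch of how this cached property ties the CLI to the analyzer, mirroring the rewritten analyze command above; the profile name and arguments are placeholders:

    # Hypothetical consumption of ApplicationContext.analyzer.
    import logging

    from databricks.sdk import WorkspaceClient
    from databricks.labs.lakebridge.contexts.application import ApplicationContext

    # Raising the package logger to DEBUG makes the cached property construct the
    # analyzer with is_debug=True, since it checks the effective logger level.
    logging.getLogger("databricks.labs.lakebridge").setLevel(logging.DEBUG)

    ctx = ApplicationContext(WorkspaceClient(profile="DEFAULT"))  # placeholder profile
    # No arguments -> prompt-guided flow; all three -> argument-guided flow.
    ctx.analyzer.run_analyzer("/path/to/sources", "/path/to/report.xlsx", "Synapse")  # placeholder values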
databricks/labs/lakebridge/deployment/job.py
@@ -112,8 +112,8 @@ class JobDeployment:
         libraries = [
             compute.Library(whl=remorph_wheel_path),
         ]
-
-        if
+
+        if recon_config.data_source == ReconSourceType.ORACLE.value:
             # TODO: Automatically fetch a version list for `ojdbc8`
             oracle_driver_version = "23.4.0.24.05"
             libraries.append(
databricks/labs/lakebridge/helpers/file_utils.py
@@ -1,8 +1,13 @@
 import contextlib
+import logging
 import os
 from pathlib import Path
+from shutil import move, Error
+from datetime import datetime
 from collections.abc import Generator
 
+logger = logging.getLogger(__name__)
+
 
 def is_sql_file(file: str | Path) -> bool:
     """
@@ -63,3 +68,34 @@ def chdir(new_path: Path) -> Generator[None, None, None]:
         yield
     finally:
         os.chdir(saved_path)
+
+
+def check_path(path: str) -> bool:
+    """Validates a path for both existing files and writable files."""
+    try:
+        path_obj = Path(path) if not isinstance(path, Path) else path
+
+        if path_obj.exists():
+            return os.access(path_obj, os.W_OK)
+
+        parent = path_obj.parent
+        return parent.exists() and os.access(parent, os.W_OK)
+
+    except OSError as e:
+        logger.warning(f"Could not validate path: {path}, error: {e}")
+        return False
+
+
+def move_tmp_file(tmp_path: Path, output_path: Path) -> None:
+    """Process file from a temp directory"""
+    try:
+        move(tmp_path, output_path.parent)
+    except (FileExistsError, Error):
+        timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
+        new_output_path = output_path.parent / timestamp
+        new_output_path.mkdir(exist_ok=True)
+
+        move(tmp_path, new_output_path)
+    finally:
+        tmp_path.parent.rmdir()
+        logger.info(f"Results store at {output_path}")
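A minimal sketch of how the two new helpers might be used together; the destination path is a placeholder and the empty write stands in for real report content:

    # Hypothetical use of check_path and move_tmp_file: stage a file in a temp
    # directory, then move it next to the requested output path.
    import tempfile
    from pathlib import Path

    from databricks.labs.lakebridge.helpers.file_utils import check_path, move_tmp_file

    target = Path("/path/to/reports/analysis.xlsx")  # placeholder destination
    if check_path(str(target)):
        tmp_file = Path(tempfile.mkdtemp()) / target.name
        tmp_file.write_bytes(b"")  # placeholder content
        # On a name clash move_tmp_file falls back to a timestamped sub-directory,
        # and it always removes the temporary directory afterwards.
        move_tmp_file(tmp_file, target)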
databricks/labs/lakebridge/helpers/validation.py
@@ -37,19 +37,21 @@ class Validator:
             config.catalog_name,
             config.schema_name,
         )
+        # Some errors doesn't return the query test alon with the error message so need to handle those separately
+        static_errors_lkp = ["[UNRESOLVED_ROUTINE]", "[UNRESOLVED_COLUMN.WITHOUT_SUGGESTION]"]
         if is_valid:
             result = sql_text
             if exception_type is not None:
                 exception_msg = f"[{exception_type.upper()}]: {exception_msg}"
         else:
             query = ""
-            if
+            if any(err in str(exception_msg) for err in static_errors_lkp):
                 query = sql_text
             buffer = StringIO()
             buffer.write("-------------- Exception Start-------------------\n")
-            buffer.write("
+            buffer.write("/*\n")
             buffer.write(str(exception_msg))
-            buffer.write("\n
+            buffer.write("\n*/\n")
             buffer.write(query)
             buffer.write("\n ---------------Exception End --------------------\n")
 
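For reference, a small sketch of the report shape the rewritten block now produces: the exception text is wrapped in a /* ... */ comment so the echoed query remains valid SQL to copy out. The message and query below are illustrative only:

    # Hypothetical reproduction of the exception-report format assembled by Validator.
    from io import StringIO

    exception_msg = "[UNRESOLVED_COLUMN.WITHOUT_SUGGESTION] A column with this name does not exist"
    query = "SELECT unknown_col FROM t"  # echoed because the error matches static_errors_lkp

    buffer = StringIO()
    buffer.write("-------------- Exception Start-------------------\n")
    buffer.write("/*\n")
    buffer.write(exception_msg)
    buffer.write("\n*/\n")
    buffer.write(query)
    buffer.write("\n ---------------Exception End --------------------\n")
    print(buffer.getvalue())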