databricks-labs-lakebridge 0.10.6__py3-none-any.whl → 0.10.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. databricks/labs/lakebridge/__about__.py +1 -1
  2. databricks/labs/lakebridge/analyzer/__init__.py +0 -0
  3. databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py +95 -0
  4. databricks/labs/lakebridge/base_install.py +24 -3
  5. databricks/labs/lakebridge/cli.py +19 -53
  6. databricks/labs/lakebridge/contexts/application.py +7 -0
  7. databricks/labs/lakebridge/deployment/job.py +2 -2
  8. databricks/labs/lakebridge/helpers/file_utils.py +36 -0
  9. databricks/labs/lakebridge/install.py +187 -157
  10. databricks/labs/lakebridge/reconcile/compare.py +70 -33
  11. databricks/labs/lakebridge/reconcile/connectors/data_source.py +19 -0
  12. databricks/labs/lakebridge/reconcile/connectors/databricks.py +11 -1
  13. databricks/labs/lakebridge/reconcile/connectors/dialect_utils.py +126 -0
  14. databricks/labs/lakebridge/reconcile/connectors/models.py +7 -0
  15. databricks/labs/lakebridge/reconcile/connectors/oracle.py +11 -1
  16. databricks/labs/lakebridge/reconcile/connectors/snowflake.py +14 -2
  17. databricks/labs/lakebridge/reconcile/connectors/tsql.py +27 -2
  18. databricks/labs/lakebridge/reconcile/constants.py +4 -3
  19. databricks/labs/lakebridge/reconcile/execute.py +9 -810
  20. databricks/labs/lakebridge/reconcile/normalize_recon_config_service.py +133 -0
  21. databricks/labs/lakebridge/reconcile/query_builder/base.py +3 -7
  22. databricks/labs/lakebridge/reconcile/recon_config.py +3 -0
  23. databricks/labs/lakebridge/reconcile/recon_output_config.py +2 -1
  24. databricks/labs/lakebridge/reconcile/reconciliation.py +508 -0
  25. databricks/labs/lakebridge/reconcile/schema_compare.py +26 -19
  26. databricks/labs/lakebridge/reconcile/trigger_recon_aggregate_service.py +98 -0
  27. databricks/labs/lakebridge/reconcile/trigger_recon_service.py +253 -0
  28. databricks/labs/lakebridge/reconcile/utils.py +38 -0
  29. databricks/labs/lakebridge/transpiler/lsp/lsp_engine.py +45 -60
  30. databricks/labs/lakebridge/transpiler/sqlglot/dialect_utils.py +2 -0
  31. databricks/labs/lakebridge/transpiler/transpile_engine.py +0 -18
  32. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/METADATA +1 -1
  33. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/RECORD +37 -28
  34. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/WHEEL +0 -0
  35. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/entry_points.txt +0 -0
  36. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/licenses/LICENSE +0 -0
  37. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/licenses/NOTICE +0 -0
databricks/labs/lakebridge/reconcile/schema_compare.py

@@ -1,10 +1,10 @@
  import logging
- from dataclasses import asdict

  from pyspark.sql import DataFrame, SparkSession
  from pyspark.sql.types import BooleanType, StringType, StructField, StructType
  from sqlglot import Dialect, parse_one

+ from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils
  from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import get_dialect
  from databricks.labs.lakebridge.reconcile.recon_config import Schema, Table
  from databricks.labs.lakebridge.reconcile.recon_output_config import SchemaMatchResult, SchemaReconcileOutput
@@ -20,8 +20,7 @@ class SchemaCompare:
  ):
  self.spark = spark

- # Define the schema for the schema compare DataFrame
- _schema_compare_schema: StructType = StructType(
+ _schema_compare_output_schema: StructType = StructType(
  [
  StructField("source_column", StringType(), False),
  StructField("source_datatype", StringType(), False),
@@ -47,14 +46,16 @@ class SchemaCompare:
  target_column_map = table_conf.to_src_col_map or {}
  master_schema_match_res = [
  SchemaMatchResult(
- source_column=s.column_name,
- databricks_column=target_column_map.get(s.column_name, s.column_name),
+ source_column_normalized=s.source_normalized_column_name,
+ source_column_normalized_ansi=s.ansi_normalized_column_name,
  source_datatype=s.data_type,
+ databricks_column=target_column_map.get(s.ansi_normalized_column_name, s.ansi_normalized_column_name),
  databricks_datatype=next(
  (
  tgt.data_type
  for tgt in databricks_schema
- if tgt.column_name == target_column_map.get(s.column_name, s.column_name)
+ if tgt.ansi_normalized_column_name
+ == target_column_map.get(s.ansi_normalized_column_name, s.ansi_normalized_column_name)
  ),
  "",
  ),
@@ -63,16 +64,22 @@ class SchemaCompare:
  ]
  return master_schema_match_res

- def _create_dataframe(self, data: list, schema: StructType) -> DataFrame:
- """
- :param data: Expectation is list of dataclass
- :param schema: Target schema
- :return: DataFrame
- """
- data = [tuple(asdict(item).values()) for item in data]
- df = self.spark.createDataFrame(data, schema)
+ def _create_output_dataframe(self, data: list[SchemaMatchResult], schema: StructType) -> DataFrame:
+ """Return a user-friendly dataframe for schema compare result."""
+ transformed = []
+ for item in data:
+ output = tuple(
+ [
+ DialectUtils.unnormalize_identifier(item.source_column_normalized_ansi),
+ item.source_datatype,
+ DialectUtils.unnormalize_identifier(item.databricks_column),
+ item.databricks_datatype,
+ item.is_valid,
+ ]
+ )
+ transformed.append(output)

- return df
+ return self.spark.createDataFrame(transformed, schema)

  @classmethod
  def _parse(cls, source: Dialect, column: str, data_type: str) -> str:
@@ -88,10 +95,10 @@ class SchemaCompare:

  @classmethod
  def _validate_parsed_query(cls, master: SchemaMatchResult, parsed_query) -> None:
- databricks_query = f"create table dummy ({master.source_column} {master.databricks_datatype})"
+ databricks_query = f"create table dummy ({master.source_column_normalized_ansi} {master.databricks_datatype})"
  logger.info(
  f"""
- Source datatype: create table dummy ({master.source_column} {master.source_datatype})
+ Source datatype: create table dummy ({master.source_column_normalized} {master.source_datatype})
  Parse datatype: {parsed_query}
  Databricks datatype: {databricks_query}
  """
@@ -116,11 +123,11 @@ class SchemaCompare:
  master_schema = self._build_master_schema(source_schema, databricks_schema, table_conf)
  for master in master_schema:
  if not isinstance(source, Databricks):
- parsed_query = self._parse(source, master.source_column, master.source_datatype)
+ parsed_query = self._parse(source, master.source_column_normalized, master.source_datatype)
  self._validate_parsed_query(master, parsed_query)
  elif master.source_datatype.lower() != master.databricks_datatype.lower():
  master.is_valid = False

- df = self._create_dataframe(master_schema, self._schema_compare_schema)
+ df = self._create_output_dataframe(master_schema, self._schema_compare_output_schema)
  final_result = self._table_schema_status(master_schema)
  return SchemaReconcileOutput(final_result, df)
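The SchemaCompare changes above replace the generic asdict()-based row builder with an explicit, field-by-field mapping, and strip identifier quoting before the report is produced. Below is a minimal sketch of that pattern; the stand-in unnormalize_identifier is hypothetical (the real DialectUtils implementation is in the new connectors/dialect_utils.py, which is not shown in this excerpt).

```python
from dataclasses import dataclass


@dataclass
class SchemaMatchRow:  # stand-in for SchemaMatchResult
    source_column_normalized_ansi: str
    source_datatype: str
    databricks_column: str
    databricks_datatype: str
    is_valid: bool = True


def unnormalize_identifier(identifier: str) -> str:
    # Assumed behaviour for illustration only: present identifiers without ANSI quoting.
    return identifier.strip('`"')


def to_output_rows(results: list[SchemaMatchRow]) -> list[tuple]:
    # Mirrors the explicit field-by-field tuple construction that replaced asdict().
    return [
        (
            unnormalize_identifier(r.source_column_normalized_ansi),
            r.source_datatype,
            unnormalize_identifier(r.databricks_column),
            r.databricks_datatype,
            r.is_valid,
        )
        for r in results
    ]


rows = to_output_rows([SchemaMatchRow('"order_id"', "number", "`order_id`", "decimal(38,0)")])
# spark.createDataFrame(rows, schema) then yields the user-facing report.
```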
databricks/labs/lakebridge/reconcile/trigger_recon_aggregate_service.py (new file)

@@ -0,0 +1,98 @@
+ from datetime import datetime
+
+ from pyspark.sql import SparkSession
+ from databricks.sdk import WorkspaceClient
+
+ from databricks.labs.lakebridge.config import ReconcileConfig, TableRecon
+ from databricks.labs.lakebridge.reconcile import utils
+ from databricks.labs.lakebridge.reconcile.exception import DataSourceRuntimeException, ReconciliationException
+ from databricks.labs.lakebridge.reconcile.recon_capture import (
+ ReconIntermediatePersist,
+ generate_final_reconcile_aggregate_output,
+ )
+ from databricks.labs.lakebridge.reconcile.recon_config import Table, Schema, AGG_RECONCILE_OPERATION_NAME
+ from databricks.labs.lakebridge.reconcile.recon_output_config import (
+ ReconcileProcessDuration,
+ AggregateQueryOutput,
+ DataReconcileOutput,
+ )
+ from databricks.labs.lakebridge.reconcile.reconciliation import Reconciliation
+ from databricks.labs.lakebridge.reconcile.trigger_recon_service import TriggerReconService
+ from databricks.labs.lakebridge.reconcile.normalize_recon_config_service import NormalizeReconConfigService
+
+
+ class TriggerReconAggregateService:
+ @staticmethod
+ def trigger_recon_aggregates(
+ ws: WorkspaceClient,
+ spark: SparkSession,
+ table_recon: TableRecon,
+ reconcile_config: ReconcileConfig,
+ local_test_run: bool = False,
+ ):
+ reconciler, recon_capture = TriggerReconService.create_recon_dependencies(
+ ws, spark, reconcile_config, local_test_run
+ )
+
+ # Get the Aggregated Reconciliation Output for each table
+ for table_conf in table_recon.tables:
+ normalized_table_conf = NormalizeReconConfigService(
+ reconciler.source, reconciler.target
+ ).normalize_recon_table_config(table_conf)
+
+ recon_process_duration = ReconcileProcessDuration(start_ts=str(datetime.now()), end_ts=None)
+ try:
+ src_schema, tgt_schema = TriggerReconService.get_schemas(
+ reconciler.source, reconciler.target, normalized_table_conf, reconcile_config.database_config
+ )
+ except DataSourceRuntimeException as e:
+ raise ReconciliationException(message=str(e)) from e
+
+ assert normalized_table_conf.aggregates, "Aggregates must be defined for Aggregates Reconciliation"
+
+ table_reconcile_agg_output_list: list[AggregateQueryOutput] = (
+ TriggerReconAggregateService._run_reconcile_aggregates(
+ reconciler=reconciler,
+ table_conf=normalized_table_conf,
+ src_schema=src_schema,
+ tgt_schema=tgt_schema,
+ )
+ )
+
+ recon_process_duration.end_ts = str(datetime.now())
+
+ # Persist the data to the delta tables
+ recon_capture.store_aggregates_metrics(
+ reconcile_agg_output_list=table_reconcile_agg_output_list,
+ table_conf=normalized_table_conf,
+ recon_process_duration=recon_process_duration,
+ )
+
+ (
+ ReconIntermediatePersist(
+ spark=spark,
+ path=utils.generate_volume_path(normalized_table_conf, reconcile_config.metadata_config),
+ ).clean_unmatched_df_from_volume()
+ )
+
+ return TriggerReconService.verify_successful_reconciliation(
+ generate_final_reconcile_aggregate_output(
+ recon_id=recon_capture.recon_id,
+ spark=spark,
+ metadata_config=reconcile_config.metadata_config,
+ local_test_run=local_test_run,
+ ),
+ operation_name=AGG_RECONCILE_OPERATION_NAME,
+ )
+
+ @staticmethod
+ def _run_reconcile_aggregates(
+ reconciler: Reconciliation,
+ table_conf: Table,
+ src_schema: list[Schema],
+ tgt_schema: list[Schema],
+ ) -> list[AggregateQueryOutput]:
+ try:
+ return reconciler.reconcile_aggregates(table_conf, src_schema, tgt_schema)
+ except DataSourceRuntimeException as e:
+ return [AggregateQueryOutput(reconcile_output=DataReconcileOutput(exception=str(e)), rule=None)]
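The new trigger_recon_aggregate_service.py (per the file list above) puts aggregate reconciliation behind a single static entry point. A minimal usage sketch, assuming a WorkspaceClient, SparkSession, TableRecon, and ReconcileConfig are already constructed by the surrounding job:

```python
from databricks.labs.lakebridge.reconcile.exception import ReconciliationException
from databricks.labs.lakebridge.reconcile.trigger_recon_aggregate_service import TriggerReconAggregateService


def run_aggregate_recon(ws, spark, table_recon, reconcile_config):
    try:
        # Normalizes each Table config, runs the aggregate rules per table, stores the
        # metrics, and verifies the final aggregate output before returning it.
        return TriggerReconAggregateService.trigger_recon_aggregates(ws, spark, table_recon, reconcile_config)
    except ReconciliationException as exc:
        # Raised when the schema lookup fails or any table's aggregate reconciliation fails.
        print(f"Aggregate reconciliation failed: {exc}")
        raise
```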
databricks/labs/lakebridge/reconcile/trigger_recon_service.py (new file)

@@ -0,0 +1,253 @@
+ import logging
+ from datetime import datetime
+ from uuid import uuid4
+
+ from pyspark.errors import PySparkException
+ from pyspark.sql import SparkSession
+
+ from databricks.sdk import WorkspaceClient
+
+ from databricks.labs.lakebridge.config import ReconcileConfig, TableRecon, DatabaseConfig
+ from databricks.labs.lakebridge.reconcile import utils
+ from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource
+ from databricks.labs.lakebridge.reconcile.exception import DataSourceRuntimeException, ReconciliationException
+ from databricks.labs.lakebridge.reconcile.recon_capture import (
+ ReconCapture,
+ ReconIntermediatePersist,
+ generate_final_reconcile_output,
+ )
+ from databricks.labs.lakebridge.reconcile.recon_config import Table, Schema
+ from databricks.labs.lakebridge.reconcile.recon_output_config import (
+ ReconcileOutput,
+ ReconcileProcessDuration,
+ SchemaReconcileOutput,
+ DataReconcileOutput,
+ )
+ from databricks.labs.lakebridge.reconcile.reconciliation import Reconciliation
+ from databricks.labs.lakebridge.reconcile.schema_compare import SchemaCompare
+ from databricks.labs.lakebridge.reconcile.normalize_recon_config_service import NormalizeReconConfigService
+ from databricks.labs.lakebridge.transpiler.execute import verify_workspace_client
+ from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import get_dialect
+
+ logger = logging.getLogger(__name__)
+ _RECON_REPORT_TYPES = {"schema", "data", "row", "all", "aggregate"}
+
+
+ class TriggerReconService:
+
+ @staticmethod
+ def trigger_recon(
+ ws: WorkspaceClient,
+ spark: SparkSession,
+ table_recon: TableRecon,
+ reconcile_config: ReconcileConfig,
+ local_test_run: bool = False,
+ ) -> ReconcileOutput:
+ reconciler, recon_capture = TriggerReconService.create_recon_dependencies(
+ ws, spark, reconcile_config, local_test_run
+ )
+
+ for table_conf in table_recon.tables:
+ TriggerReconService.recon_one(spark, reconciler, recon_capture, reconcile_config, table_conf)
+
+ return TriggerReconService.verify_successful_reconciliation(
+ generate_final_reconcile_output(
+ recon_id=recon_capture.recon_id,
+ spark=spark,
+ metadata_config=reconcile_config.metadata_config,
+ local_test_run=local_test_run,
+ )
+ )
+
+ @staticmethod
+ def create_recon_dependencies(
+ ws: WorkspaceClient, spark: SparkSession, reconcile_config: ReconcileConfig, local_test_run: bool = False
+ ) -> tuple[Reconciliation, ReconCapture]:
+ ws_client: WorkspaceClient = verify_workspace_client(ws)
+
+ # validate the report type
+ report_type = reconcile_config.report_type.lower()
+ logger.info(f"report_type: {report_type}, data_source: {reconcile_config.data_source} ")
+ utils.validate_input(report_type, _RECON_REPORT_TYPES, "Invalid report type")
+
+ source, target = utils.initialise_data_source(
+ engine=reconcile_config.data_source,
+ spark=spark,
+ ws=ws_client,
+ secret_scope=reconcile_config.secret_scope,
+ )
+
+ recon_id = str(uuid4())
+ # initialise the Reconciliation
+ reconciler = Reconciliation(
+ source,
+ target,
+ reconcile_config.database_config,
+ report_type,
+ SchemaCompare(spark=spark),
+ get_dialect(reconcile_config.data_source),
+ spark,
+ metadata_config=reconcile_config.metadata_config,
+ )
+
+ recon_capture = ReconCapture(
+ database_config=reconcile_config.database_config,
+ recon_id=recon_id,
+ report_type=report_type,
+ source_dialect=get_dialect(reconcile_config.data_source),
+ ws=ws_client,
+ spark=spark,
+ metadata_config=reconcile_config.metadata_config,
+ local_test_run=local_test_run,
+ )
+
+ return reconciler, recon_capture
+
+ @staticmethod
+ def recon_one(
+ spark: SparkSession,
+ reconciler: Reconciliation,
+ recon_capture: ReconCapture,
+ reconcile_config: ReconcileConfig,
+ table_conf: Table,
+ ):
+ normalized_table_conf = NormalizeReconConfigService(
+ reconciler.source, reconciler.target
+ ).normalize_recon_table_config(table_conf)
+
+ schema_reconcile_output, data_reconcile_output, recon_process_duration = TriggerReconService._do_recon_one(
+ reconciler, reconcile_config, normalized_table_conf
+ )
+
+ TriggerReconService.persist_delta_table(
+ spark,
+ reconciler,
+ recon_capture,
+ schema_reconcile_output,
+ data_reconcile_output,
+ reconcile_config,
+ normalized_table_conf,
+ recon_process_duration,
+ )
+
+ @staticmethod
+ def _do_recon_one(reconciler: Reconciliation, reconcile_config: ReconcileConfig, table_conf: Table):
+ recon_process_duration = ReconcileProcessDuration(start_ts=str(datetime.now()), end_ts=None)
+ schema_reconcile_output = SchemaReconcileOutput(is_valid=True)
+ data_reconcile_output = DataReconcileOutput()
+
+ try:
+ src_schema, tgt_schema = TriggerReconService.get_schemas(
+ reconciler.source, reconciler.target, table_conf, reconcile_config.database_config
+ )
+ except DataSourceRuntimeException as e:
+ schema_reconcile_output = SchemaReconcileOutput(is_valid=False, exception=str(e))
+ else:
+ if reconciler.report_type in {"schema", "all"}:
+ schema_reconcile_output = TriggerReconService._run_reconcile_schema(
+ reconciler=reconciler,
+ table_conf=table_conf,
+ src_schema=src_schema,
+ tgt_schema=tgt_schema,
+ )
+ logger.warning("Schema comparison is completed.")
+
+ if reconciler.report_type in {"data", "row", "all"}:
+ data_reconcile_output = TriggerReconService._run_reconcile_data(
+ reconciler=reconciler,
+ table_conf=table_conf,
+ src_schema=src_schema,
+ tgt_schema=tgt_schema,
+ )
+ logger.warning(f"Reconciliation for '{reconciler.report_type}' report completed.")
+
+ recon_process_duration.end_ts = str(datetime.now())
+ return schema_reconcile_output, data_reconcile_output, recon_process_duration
+
+ @staticmethod
+ def get_schemas(
+ source: DataSource,
+ target: DataSource,
+ table_conf: Table,
+ database_config: DatabaseConfig,
+ ) -> tuple[list[Schema], list[Schema]]:
+ src_schema = source.get_schema(
+ catalog=database_config.source_catalog,
+ schema=database_config.source_schema,
+ table=table_conf.source_name,
+ )
+
+ tgt_schema = target.get_schema(
+ catalog=database_config.target_catalog,
+ schema=database_config.target_schema,
+ table=table_conf.target_name,
+ )
+
+ return src_schema, tgt_schema
+
+ @staticmethod
+ def _run_reconcile_schema(
+ reconciler: Reconciliation,
+ table_conf: Table,
+ src_schema: list[Schema],
+ tgt_schema: list[Schema],
+ ):
+ try:
+ return reconciler.reconcile_schema(table_conf=table_conf, src_schema=src_schema, tgt_schema=tgt_schema)
+ except PySparkException as e:
+ return SchemaReconcileOutput(is_valid=False, exception=str(e))
+
+ @staticmethod
+ def _run_reconcile_data(
+ reconciler: Reconciliation,
+ table_conf: Table,
+ src_schema: list[Schema],
+ tgt_schema: list[Schema],
+ ) -> DataReconcileOutput:
+ try:
+ return reconciler.reconcile_data(table_conf=table_conf, src_schema=src_schema, tgt_schema=tgt_schema)
+ except DataSourceRuntimeException as e:
+ return DataReconcileOutput(exception=str(e))
+
+ @staticmethod
+ def persist_delta_table(
+ spark: SparkSession,
+ reconciler: Reconciliation,
+ recon_capture: ReconCapture,
+ schema_reconcile_output: SchemaReconcileOutput,
+ data_reconcile_output: DataReconcileOutput,
+ reconcile_config: ReconcileConfig,
+ table_conf: Table,
+ recon_process_duration: ReconcileProcessDuration,
+ ):
+ recon_capture.start(
+ data_reconcile_output=data_reconcile_output,
+ schema_reconcile_output=schema_reconcile_output,
+ table_conf=table_conf,
+ recon_process_duration=recon_process_duration,
+ record_count=reconciler.get_record_count(table_conf, reconciler.report_type),
+ )
+ if reconciler.report_type != "schema":
+ ReconIntermediatePersist(
+ spark=spark, path=utils.generate_volume_path(table_conf, reconcile_config.metadata_config)
+ ).clean_unmatched_df_from_volume()
+
+ @staticmethod
+ def verify_successful_reconciliation(
+ reconcile_output: ReconcileOutput, operation_name: str = "reconcile"
+ ) -> ReconcileOutput:
+ for table_output in reconcile_output.results:
+ if table_output.exception_message or (
+ table_output.status.column is False
+ or table_output.status.row is False
+ or table_output.status.schema is False
+ or table_output.status.aggregate is False
+ ):
+ raise ReconciliationException(
+ f" Reconciliation failed for one or more tables. Please check the recon metrics for more details."
+ f" **{operation_name}** failed.",
+ reconcile_output=reconcile_output,
+ )
+
+ logger.info("Reconciliation completed successfully.")
+ return reconcile_output
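trigger_recon_service.py now carries the per-table orchestration that previously sat in reconcile/execute.py (which drops roughly 800 lines in this release, per the file list). A short sketch of driving it directly, assuming the same pre-built inputs as in the aggregate example above:

```python
from databricks.labs.lakebridge.reconcile.exception import ReconciliationException
from databricks.labs.lakebridge.reconcile.trigger_recon_service import TriggerReconService


def run_recon(ws, spark, table_recon, reconcile_config):
    try:
        # reconcile_config.report_type is validated against {"schema", "data", "row", "all", "aggregate"}.
        return TriggerReconService.trigger_recon(ws, spark, table_recon, reconcile_config)
    except ReconciliationException as exc:
        # Raised by verify_successful_reconciliation when any table reports a failed
        # column/row/schema/aggregate status or carries an exception message.
        print(f"Reconciliation failed: {exc}")
        raise
```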
databricks/labs/lakebridge/reconcile/utils.py (new file)

@@ -0,0 +1,38 @@
+ import logging
+
+ from pyspark.sql import SparkSession
+
+ from databricks.sdk import WorkspaceClient
+
+ from databricks.labs.lakebridge.config import ReconcileMetadataConfig
+ from databricks.labs.lakebridge.reconcile.connectors.source_adapter import create_adapter
+ from databricks.labs.lakebridge.reconcile.exception import InvalidInputException
+ from databricks.labs.lakebridge.reconcile.recon_config import Table
+ from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import get_dialect
+
+ logger = logging.getLogger(__name__)
+
+
+ def initialise_data_source(
+ ws: WorkspaceClient,
+ spark: SparkSession,
+ engine: str,
+ secret_scope: str,
+ ):
+ source = create_adapter(engine=get_dialect(engine), spark=spark, ws=ws, secret_scope=secret_scope)
+ target = create_adapter(engine=get_dialect("databricks"), spark=spark, ws=ws, secret_scope=secret_scope)
+
+ return source, target
+
+
+ def validate_input(input_value: str, list_of_value: set, message: str):
+ if input_value not in list_of_value:
+ error_message = f"{message} --> {input_value} is not one of {list_of_value}"
+ logger.error(error_message)
+ raise InvalidInputException(error_message)
+
+
+ def generate_volume_path(table_conf: Table, metadata_config: ReconcileMetadataConfig):
+ catalog = metadata_config.catalog
+ schema = metadata_config.schema
+ return f"/Volumes/{catalog}/{schema}/{metadata_config.volume}/{table_conf.source_name}_{table_conf.target_name}/"
databricks/labs/lakebridge/transpiler/lsp/lsp_engine.py

@@ -4,7 +4,9 @@ import abc
  import asyncio
  import logging
  import os
+ import shutil
  import sys
+ import venv
  from collections.abc import Callable, Sequence, Mapping
  from dataclasses import dataclass
  from pathlib import Path
@@ -35,7 +37,7 @@ from pygls.lsp.client import BaseLanguageClient
  from databricks.labs.blueprint.wheels import ProductInfo
  from databricks.labs.lakebridge.config import LSPConfigOptionV1, TranspileConfig, TranspileResult
  from databricks.labs.lakebridge.errors.exceptions import IllegalStateException
- from databricks.labs.lakebridge.helpers.file_utils import chdir, is_dbt_project_file, is_sql_file
+ from databricks.labs.lakebridge.helpers.file_utils import is_dbt_project_file, is_sql_file
  from databricks.labs.lakebridge.transpiler.transpile_engine import TranspileEngine
  from databricks.labs.lakebridge.transpiler.transpile_status import (
  CodePosition,
@@ -409,9 +411,7 @@ class LSPEngine(TranspileEngine):
  if self.is_alive:
  raise IllegalStateException("LSP engine is already initialized")
  try:
- # TODO: Avoid this by setting the working directory when launching the child process.
- with chdir(self._workdir):
- await self._do_initialize(config)
+ await self._do_initialize(config)
  await self._await_for_transpile_capability()
  # it is good practice to catch broad exceptions raised by launching a child process
  except Exception as e: # pylint: disable=broad-exception-caught
@@ -432,65 +432,50 @@ class LSPEngine(TranspileEngine):
  logger.debug(f"LSP init params: {params}")
  self._init_response = await self._client.initialize_async(params)

- async def _start_server(self):
- executable = self._config.remorph.command_line[0]
- if executable in {"python", "python3"}:
- await self._start_python_server()
- else:
- await self._start_other_server()
-
- async def _start_python_server(self):
- has_venv = (self._workdir / ".venv").exists()
- if has_venv:
- await self._start_python_server_with_venv()
- else:
- await self._start_python_server_without_venv()
-
- async def _start_python_server_with_venv(self):
- env: dict[str, str] = os.environ | self._config.remorph.env_vars
- # ensure modules are searched within venv
- if "PYTHONPATH" in env.keys():
- del env["PYTHONPATH"]
- if "VIRTUAL_ENV" in env.keys():
- del env["VIRTUAL_ENV"]
- if "VIRTUAL_ENV_PROMPT" in env.keys():
- del env["VIRTUAL_ENV_PROMPT"]
- path = self._workdir / ".venv" / "Scripts" if sys.platform == "win32" else self._workdir / ".venv" / "bin"
- if "PATH" in env.keys():
- env["PATH"] = str(path) + os.pathsep + env["PATH"]
- else:
- env["PATH"] = str(path)
- python = "python.exe" if sys.platform == "win32" else "python3"
- executable = path / python
- await self._launch_executable(executable, env)
-
- async def _start_python_server_without_venv(self):
- env: dict[str, str] = os.environ | self._config.remorph.env_vars
- # ensure modules are searched locally before being searched in remorph
- if "PYTHONPATH" in env.keys():
- env["PYTHONPATH"] = str(self._workdir) + os.pathsep + env["PYTHONPATH"]
- else:
- env["PYTHONPATH"] = str(self._workdir)
- executable = Path(self._config.remorph.command_line[0])
- await self._launch_executable(executable, env)
+ async def _start_server(self) -> None:
+ # Sanity-check and split the command-line into components.
+ if not (command_line := self._config.remorph.command_line):
+ raise ValueError(f"Missing command line for LSP server: {self._config.path}")
+ executable, *args = command_line

- async def _start_other_server(self):
+ # Extract the environment, preparing to ensure that PATH is set correctly.
  env: dict[str, str] = os.environ | self._config.remorph.env_vars
- # ensure modules are searched within venv
- if "PYTHONPATH" in env.keys():
- del env["PYTHONPATH"]
- if "VIRTUAL_ENV" in env.keys():
- del env["VIRTUAL_ENV"]
- if "VIRTUAL_ENV_PROMPT" in env.keys():
- del env["VIRTUAL_ENV_PROMPT"]
- executable = Path(self._config.remorph.command_line[0])
- await self._launch_executable(executable, env)
-
- async def _launch_executable(self, executable: Path, env: Mapping):
+ path = env.get("PATH", os.defpath)
+
+ # If we have a virtual environment, ensure the bin directory is first on the PATH. This normally takes
+ # care of python executables, but also deals with any entry-points that the LSP server might install.
+ if (venv_path := self._workdir / ".venv").exists():
+ executable, additional_path = self._activate_venv(venv_path, executable)
+ # Ensure PATH is in sync with the search path we will use to locate the LSP server executable.
+ env["PATH"] = path = f"{additional_path}{os.pathsep}{path}"
+ logger.debug(f"Using PATH for launching LSP server: {path}")
+
+ # Locate the LSP server executable in a platform-independent way.
+ # Reference: https://docs.python.org/3/library/subprocess.html#popen-constructor
+ executable = shutil.which(executable, path=path) or executable
+
+ await self._launch_executable(executable, args, env)
+
+ @staticmethod
+ def _activate_venv(venv_path: Path, executable: str) -> tuple[str, Path]:
+ """Obtain the bin/script directory for the virtual environment, to extend the search path."""
+ logger.debug(f"Detected virtual environment to use at: {venv_path}")
+ use_symlinks = sys.platform != "win32"
+ builder = venv.EnvBuilder(symlinks=use_symlinks)
+ context = builder.ensure_directories(venv_path)
+
+ # Workaround for Windows, where bin_path (Scripts/) doesn't contain python3.exe: if the executable is python
+ # or python3, we substitute it for what is needed to launch the venv's python interpreter.
+ if os.path.normcase(executable) in {"python", "python3"}:
+ executable = context.env_exec_cmd
+
+ return executable, context.bin_path
+
+ async def _launch_executable(self, executable: str, args: Sequence[str], env: Mapping[str, str]) -> None:
  log_level = logging.getLevelName(logging.getLogger("databricks").level)
- args = self._config.remorph.command_line[1:] + [f"--log_level={log_level}"]
- logger.debug(f"Starting LSP engine: {executable} {args} (cwd={os.getcwd()})")
- await self._client.start_io(str(executable), env=env, *args)
+ args = [*args, f"--log_level={log_level}"]
+ logger.debug(f"Starting LSP engine: {executable} {args} (cwd={self._workdir})")
+ await self._client.start_io(executable, *args, env=env, cwd=self._workdir)

  def _client_capabilities(self):
  return ClientCapabilities() # TODO do we need to refine this ?
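The rewritten _start_server above collapses the per-case launch paths into one flow: detect a project-local .venv, prepend its bin/Scripts directory to PATH, resolve the executable with shutil.which, and hand off to the LSP client with an explicit working directory. A stand-alone sketch of the same technique; the helper name and paths are illustrative, not part of the package:

```python
import os
import shutil
import sys
import venv
from pathlib import Path


def resolve_command(workdir: Path, executable: str) -> tuple[str, dict[str, str]]:
    env = dict(os.environ)
    path = env.get("PATH", os.defpath)
    venv_path = workdir / ".venv"
    if venv_path.exists():
        # ensure_directories() returns the venv layout (bin_path, env_exec_cmd) without
        # clearing an existing environment.
        context = venv.EnvBuilder(symlinks=sys.platform != "win32").ensure_directories(venv_path)
        if os.path.normcase(executable) in {"python", "python3"}:
            executable = context.env_exec_cmd
        env["PATH"] = path = f"{context.bin_path}{os.pathsep}{path}"
    return shutil.which(executable, path=path) or executable, env


# command, env = resolve_command(Path("/path/to/transpiler"), "python3")
# The LSP client would then be started with that command, env=env, and cwd set to the workdir.
```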
databricks/labs/lakebridge/transpiler/sqlglot/dialect_utils.py

@@ -18,6 +18,8 @@ SQLGLOT_DIALECTS: dict[str, type[Dialect] | str] = {
  "teradata": Dialects.TERADATA,
  "trino": Dialects.TRINO,
  "tsql": Dialects.TSQL,
+ "mssql": Dialects.TSQL,
+ "synapse": Dialects.TSQL,
  "vertica": Dialects.POSTGRES,
  }

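The two added entries let TSQL-family sources be configured under the names "mssql" or "synapse". A quick check of the mapping:

```python
from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import SQLGLOT_DIALECTS

# "mssql" and "synapse" are aliases for the same TSQL dialect entry as "tsql".
assert SQLGLOT_DIALECTS["mssql"] is SQLGLOT_DIALECTS["tsql"]
assert SQLGLOT_DIALECTS["synapse"] is SQLGLOT_DIALECTS["tsql"]
```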
databricks/labs/lakebridge/transpiler/transpile_engine.py

@@ -6,24 +6,6 @@ from databricks.labs.lakebridge.config import TranspileResult, TranspileConfig


  class TranspileEngine(abc.ABC):
-
- @classmethod
- def load_engine(cls, transpiler_config_path: Path) -> TranspileEngine:
- # TODO remove this once sqlglot transpiler is pluggable
- if str(transpiler_config_path) == "sqlglot":
- # pylint: disable=import-outside-toplevel, cyclic-import
- from databricks.labs.lakebridge.transpiler.sqlglot.sqlglot_engine import SqlglotEngine
-
- return SqlglotEngine()
- if not transpiler_config_path.exists():
- raise ValueError(
- f"Error: Invalid value for '--transpiler-config-path': '{str(transpiler_config_path)}', file does not exist."
- )
- # pylint: disable=import-outside-toplevel, cyclic-import
- from databricks.labs.lakebridge.transpiler.lsp.lsp_engine import LSPEngine
-
- return LSPEngine.from_config_path(transpiler_config_path)
-

  @abc.abstractmethod
  async def initialize(self, config: TranspileConfig) -> None: ...
databricks_labs_lakebridge-0.10.7.dist-info/METADATA

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: databricks-labs-lakebridge
- Version: 0.10.6
+ Version: 0.10.7
  Summary: Fast and predictable migrations to Databricks Lakehouse Platform. This tool is designed to help you migrate your data and workloads to the Databricks Lakehouse Platform in a fast, predictable, and reliable way. It provides a set of tools and utilities to help you reconcile your data and workloads, assess your current state, and plan your migration.
  Project-URL: Documentation, https://databrickslabs.github.io/lakebridge
  Project-URL: Issues, https://github.com/databrickslabs/lakebridge/issues