databricks-labs-lakebridge 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (171)
  1. databricks/__init__.py +3 -0
  2. databricks/labs/__init__.py +3 -0
  3. databricks/labs/lakebridge/__about__.py +2 -0
  4. databricks/labs/lakebridge/__init__.py +11 -0
  5. databricks/labs/lakebridge/assessments/configure_assessment.py +194 -0
  6. databricks/labs/lakebridge/assessments/pipeline.py +188 -0
  7. databricks/labs/lakebridge/assessments/profiler_config.py +30 -0
  8. databricks/labs/lakebridge/base_install.py +12 -0
  9. databricks/labs/lakebridge/cli.py +449 -0
  10. databricks/labs/lakebridge/config.py +192 -0
  11. databricks/labs/lakebridge/connections/__init__.py +0 -0
  12. databricks/labs/lakebridge/connections/credential_manager.py +89 -0
  13. databricks/labs/lakebridge/connections/database_manager.py +98 -0
  14. databricks/labs/lakebridge/connections/env_getter.py +13 -0
  15. databricks/labs/lakebridge/contexts/__init__.py +0 -0
  16. databricks/labs/lakebridge/contexts/application.py +133 -0
  17. databricks/labs/lakebridge/coverage/__init__.py +0 -0
  18. databricks/labs/lakebridge/coverage/commons.py +223 -0
  19. databricks/labs/lakebridge/coverage/lakebridge_snow_transpilation_coverage.py +29 -0
  20. databricks/labs/lakebridge/coverage/local_report.py +9 -0
  21. databricks/labs/lakebridge/coverage/sqlglot_snow_transpilation_coverage.py +5 -0
  22. databricks/labs/lakebridge/coverage/sqlglot_tsql_transpilation_coverage.py +5 -0
  23. databricks/labs/lakebridge/deployment/__init__.py +0 -0
  24. databricks/labs/lakebridge/deployment/configurator.py +199 -0
  25. databricks/labs/lakebridge/deployment/dashboard.py +140 -0
  26. databricks/labs/lakebridge/deployment/installation.py +125 -0
  27. databricks/labs/lakebridge/deployment/job.py +147 -0
  28. databricks/labs/lakebridge/deployment/recon.py +145 -0
  29. databricks/labs/lakebridge/deployment/table.py +30 -0
  30. databricks/labs/lakebridge/deployment/upgrade_common.py +124 -0
  31. databricks/labs/lakebridge/discovery/table.py +36 -0
  32. databricks/labs/lakebridge/discovery/table_definition.py +23 -0
  33. databricks/labs/lakebridge/discovery/tsql_table_definition.py +185 -0
  34. databricks/labs/lakebridge/errors/exceptions.py +1 -0
  35. databricks/labs/lakebridge/helpers/__init__.py +0 -0
  36. databricks/labs/lakebridge/helpers/db_sql.py +24 -0
  37. databricks/labs/lakebridge/helpers/execution_time.py +20 -0
  38. databricks/labs/lakebridge/helpers/file_utils.py +64 -0
  39. databricks/labs/lakebridge/helpers/metastore.py +164 -0
  40. databricks/labs/lakebridge/helpers/recon_config_utils.py +176 -0
  41. databricks/labs/lakebridge/helpers/string_utils.py +62 -0
  42. databricks/labs/lakebridge/helpers/telemetry_utils.py +13 -0
  43. databricks/labs/lakebridge/helpers/validation.py +101 -0
  44. databricks/labs/lakebridge/install.py +849 -0
  45. databricks/labs/lakebridge/intermediate/__init__.py +0 -0
  46. databricks/labs/lakebridge/intermediate/dag.py +88 -0
  47. databricks/labs/lakebridge/intermediate/engine_adapter.py +0 -0
  48. databricks/labs/lakebridge/intermediate/root_tables.py +44 -0
  49. databricks/labs/lakebridge/jvmproxy.py +56 -0
  50. databricks/labs/lakebridge/lineage.py +42 -0
  51. databricks/labs/lakebridge/reconcile/__init__.py +0 -0
  52. databricks/labs/lakebridge/reconcile/compare.py +414 -0
  53. databricks/labs/lakebridge/reconcile/connectors/__init__.py +0 -0
  54. databricks/labs/lakebridge/reconcile/connectors/data_source.py +72 -0
  55. databricks/labs/lakebridge/reconcile/connectors/databricks.py +87 -0
  56. databricks/labs/lakebridge/reconcile/connectors/jdbc_reader.py +41 -0
  57. databricks/labs/lakebridge/reconcile/connectors/oracle.py +108 -0
  58. databricks/labs/lakebridge/reconcile/connectors/secrets.py +30 -0
  59. databricks/labs/lakebridge/reconcile/connectors/snowflake.py +173 -0
  60. databricks/labs/lakebridge/reconcile/connectors/source_adapter.py +30 -0
  61. databricks/labs/lakebridge/reconcile/connectors/sql_server.py +132 -0
  62. databricks/labs/lakebridge/reconcile/constants.py +37 -0
  63. databricks/labs/lakebridge/reconcile/exception.py +42 -0
  64. databricks/labs/lakebridge/reconcile/execute.py +920 -0
  65. databricks/labs/lakebridge/reconcile/query_builder/__init__.py +0 -0
  66. databricks/labs/lakebridge/reconcile/query_builder/aggregate_query.py +293 -0
  67. databricks/labs/lakebridge/reconcile/query_builder/base.py +138 -0
  68. databricks/labs/lakebridge/reconcile/query_builder/count_query.py +33 -0
  69. databricks/labs/lakebridge/reconcile/query_builder/expression_generator.py +292 -0
  70. databricks/labs/lakebridge/reconcile/query_builder/hash_query.py +91 -0
  71. databricks/labs/lakebridge/reconcile/query_builder/sampling_query.py +123 -0
  72. databricks/labs/lakebridge/reconcile/query_builder/threshold_query.py +231 -0
  73. databricks/labs/lakebridge/reconcile/recon_capture.py +635 -0
  74. databricks/labs/lakebridge/reconcile/recon_config.py +363 -0
  75. databricks/labs/lakebridge/reconcile/recon_output_config.py +85 -0
  76. databricks/labs/lakebridge/reconcile/runner.py +97 -0
  77. databricks/labs/lakebridge/reconcile/sampler.py +239 -0
  78. databricks/labs/lakebridge/reconcile/schema_compare.py +126 -0
  79. databricks/labs/lakebridge/resources/__init__.py +0 -0
  80. databricks/labs/lakebridge/resources/config/credentials.yml +33 -0
  81. databricks/labs/lakebridge/resources/reconcile/__init__.py +0 -0
  82. databricks/labs/lakebridge/resources/reconcile/dashboards/__init__.py +0 -0
  83. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/00_0_aggregate_recon_header.md +6 -0
  84. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_0_recon_id.filter.yml +6 -0
  85. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_1_executed_by.filter.yml +5 -0
  86. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_2_started_at.filter.yml +5 -0
  87. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_0_source_type.filter.yml +5 -0
  88. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_1_source_table.filter.yml +5 -0
  89. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_2_target_table.filter.yml +5 -0
  90. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/04_0_aggregate_summary_table.sql +46 -0
  91. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/05_0_aggregate_recon_drilldown_header.md +2 -0
  92. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_0_recon_id.filter.yml +5 -0
  93. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_1_category.filter.yml +5 -0
  94. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_2_aggregate_type.filter.yml +5 -0
  95. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/07_0_target_table.filter.yml +4 -0
  96. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/07_1_source_table.filter.yml +4 -0
  97. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/08_0_aggregate_details_table.sql +92 -0
  98. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/09_0_aggregate_missing_mismatch_header.md +1 -0
  99. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/10_0_aggr_mismatched_records.sql +19 -0
  100. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/11_0_aggr_missing_in_databricks.sql +19 -0
  101. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/11_1_aggr_missing_in_source.sql +19 -0
  102. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/dashboard.yml +365 -0
  103. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/00_0_recon_main.md +3 -0
  104. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/01_0_recon_id.filter.yml +6 -0
  105. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/01_1_report_type.filter.yml +5 -0
  106. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/01_2_executed_by.filter.yml +5 -0
  107. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/02_0_source_type.filter.yml +5 -0
  108. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/02_1_source_table.filter.yml +6 -0
  109. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/02_2_target_table.filter.yml +6 -0
  110. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/03_0_started_at.filter.yml +5 -0
  111. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/05_0_summary_table.sql +38 -0
  112. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/06_0_schema_comparison_header.md +3 -0
  113. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/07_0_schema_details_table.sql +42 -0
  114. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/08_0_drill_down_header.md +3 -0
  115. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/09_0_recon_id.filter.yml +4 -0
  116. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/09_1_category.filter.yml +4 -0
  117. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/10_0_target_table.filter.yml +4 -0
  118. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/10_1_source_table.filter.yml +4 -0
  119. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/11_0_recon_details_pivot.sql +40 -0
  120. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/12_0_daily_data_validation_issue_header.md +3 -0
  121. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/13_0_success_fail_.filter.yml +4 -0
  122. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/14_0_failed_recon_ids.sql +15 -0
  123. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/15_0_total_failed_runs.sql +10 -0
  124. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/15_1_failed_targets.sql +10 -0
  125. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/15_2_successful_targets.sql +10 -0
  126. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/16_0_missing_mismatch_header.md +1 -0
  127. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/17_0_mismatched_records.sql +14 -0
  128. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/17_1_threshold_mismatches.sql +14 -0
  129. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/18_0_missing_in_databricks.sql +14 -0
  130. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/18_1_missing_in_source.sql +14 -0
  131. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/dashboard.yml +545 -0
  132. databricks/labs/lakebridge/resources/reconcile/queries/__init__.py +0 -0
  133. databricks/labs/lakebridge/resources/reconcile/queries/installation/__init__.py +0 -0
  134. databricks/labs/lakebridge/resources/reconcile/queries/installation/aggregate_details.sql +7 -0
  135. databricks/labs/lakebridge/resources/reconcile/queries/installation/aggregate_metrics.sql +15 -0
  136. databricks/labs/lakebridge/resources/reconcile/queries/installation/aggregate_rules.sql +6 -0
  137. databricks/labs/lakebridge/resources/reconcile/queries/installation/details.sql +7 -0
  138. databricks/labs/lakebridge/resources/reconcile/queries/installation/main.sql +24 -0
  139. databricks/labs/lakebridge/resources/reconcile/queries/installation/metrics.sql +21 -0
  140. databricks/labs/lakebridge/transpiler/__init__.py +0 -0
  141. databricks/labs/lakebridge/transpiler/execute.py +423 -0
  142. databricks/labs/lakebridge/transpiler/lsp/__init__.py +0 -0
  143. databricks/labs/lakebridge/transpiler/lsp/lsp_engine.py +564 -0
  144. databricks/labs/lakebridge/transpiler/sqlglot/__init__.py +0 -0
  145. databricks/labs/lakebridge/transpiler/sqlglot/dialect_utils.py +30 -0
  146. databricks/labs/lakebridge/transpiler/sqlglot/generator/__init__.py +0 -0
  147. databricks/labs/lakebridge/transpiler/sqlglot/generator/databricks.py +771 -0
  148. databricks/labs/lakebridge/transpiler/sqlglot/lca_utils.py +138 -0
  149. databricks/labs/lakebridge/transpiler/sqlglot/local_expression.py +197 -0
  150. databricks/labs/lakebridge/transpiler/sqlglot/parsers/__init__.py +0 -0
  151. databricks/labs/lakebridge/transpiler/sqlglot/parsers/oracle.py +23 -0
  152. databricks/labs/lakebridge/transpiler/sqlglot/parsers/presto.py +202 -0
  153. databricks/labs/lakebridge/transpiler/sqlglot/parsers/snowflake.py +535 -0
  154. databricks/labs/lakebridge/transpiler/sqlglot/sqlglot_engine.py +203 -0
  155. databricks/labs/lakebridge/transpiler/transpile_engine.py +49 -0
  156. databricks/labs/lakebridge/transpiler/transpile_status.py +68 -0
  157. databricks/labs/lakebridge/uninstall.py +28 -0
  158. databricks/labs/lakebridge/upgrades/v0.4.0_add_main_table_operation_name_column.py +80 -0
  159. databricks/labs/lakebridge/upgrades/v0.6.0_alter_metrics_datatype.py +51 -0
  160. databricks_labs_lakebridge-0.10.0.dist-info/METADATA +58 -0
  161. databricks_labs_lakebridge-0.10.0.dist-info/RECORD +171 -0
  162. databricks_labs_lakebridge-0.10.0.dist-info/WHEEL +4 -0
  163. databricks_labs_lakebridge-0.10.0.dist-info/entry_points.txt +2 -0
  164. databricks_labs_lakebridge-0.10.0.dist-info/licenses/LICENSE +69 -0
  165. databricks_labs_lakebridge-0.10.0.dist-info/licenses/NOTICE +42 -0
  166. docs/lakebridge/src/components/Button.tsx +81 -0
  167. docs/lakebridge/src/css/custom.css +167 -0
  168. docs/lakebridge/src/css/table.css +20 -0
  169. docs/lakebridge/src/pages/index.tsx +57 -0
  170. docs/lakebridge/src/theme/Footer/index.tsx +24 -0
  171. docs/lakebridge/src/theme/Layout/index.tsx +18 -0
databricks/__init__.py ADDED
@@ -0,0 +1,3 @@
+ # DO NOT ADD ANYTHING ELSE TO THIS FILE FOR COMPATIBILITY WITH OTHER databricks.* PACKAGES
+ # SEE https://packaging.python.org/guides/packaging-namespace-packages/#pkgutil-style-namespace-packages
+ __path__ = __import__("pkgutil").extend_path(__path__, __name__)
databricks/labs/__init__.py ADDED
@@ -0,0 +1,3 @@
+ # DO NOT ADD ANYTHING ELSE TO THIS FILE FOR COMPATIBILITY WITH OTHER databricks.* PACKAGES
+ # SEE https://packaging.python.org/guides/packaging-namespace-packages/#pkgutil-style-namespace-packages
+ __path__ = __import__("pkgutil").extend_path(__path__, __name__)
databricks/labs/lakebridge/__about__.py ADDED
@@ -0,0 +1,2 @@
+ # DO NOT MODIFY THIS FILE
+ __version__ = "0.10.0"
databricks/labs/lakebridge/__init__.py ADDED
@@ -0,0 +1,11 @@
+ from databricks.sdk.core import with_user_agent_extra, with_product
+ from databricks.labs.blueprint.logger import install_logger
+ from databricks.labs.lakebridge.__about__ import __version__
+
+ install_logger()
+
+ # Add lakebridge/<version> for projects depending on lakebridge as a library
+ with_user_agent_extra("lakebridge", __version__)
+
+ # Add lakebridge/<version> for re-packaging of lakebridge, where product name is omitted
+ with_product("lakebridge", __version__)
databricks/labs/lakebridge/assessments/configure_assessment.py ADDED
@@ -0,0 +1,194 @@
+ from abc import ABC, abstractmethod
+ import logging
+ import shutil
+ import yaml
+
+ from databricks.labs.blueprint.tui import Prompts
+
+ from databricks.labs.lakebridge.connections.credential_manager import (
+     cred_file as creds,
+     CredentialManager,
+     create_credential_manager,
+ )
+ from databricks.labs.lakebridge.connections.database_manager import DatabaseManager
+ from databricks.labs.lakebridge.connections.env_getter import EnvGetter
+
+ logger = logging.getLogger(__name__)
+ logger.setLevel(logging.INFO)
+
+ PROFILER_SOURCE_SYSTEM = ["mssql", "synapse"]
+
+
+ class AssessmentConfigurator(ABC):
+     """Abstract base class for assessment configuration."""
+
+     def __init__(self, product_name: str, prompts: Prompts, credential_file=None):
+         self.prompts = prompts
+         self._product_name = product_name
+         self._credential_file = creds(product_name) if not credential_file else credential_file
+
+     @abstractmethod
+     def _configure_credentials(self) -> str:
+         pass
+
+     @staticmethod
+     def _test_connection(source: str, cred_manager: CredentialManager):
+         config = cred_manager.get_credentials(source)
+
+         try:
+             db_manager = DatabaseManager(source, config)
+             if db_manager.check_connection():
+                 logger.info("Connection to the source system successful")
+             else:
+                 logger.error("Connection to the source system failed, check logs in debug mode")
+                 raise SystemExit("Connection validation failed. Exiting...")
+
+         except ConnectionError as e:
+             logger.error(f"Failed to connect to the source system: {e}")
+             raise SystemExit("Connection validation failed. Exiting...") from e
+
+     def run(self):
+         """Run the assessment configuration process."""
+         logger.info(f"Welcome to the {self._product_name} Assessment Configuration")
+         source = self._configure_credentials()
+         logger.info(f"{source.capitalize()} details and credentials received.")
+         if self.prompts.confirm(f"Do you want to test the connection to {source}?"):
+             cred_manager = create_credential_manager("lakebridge", EnvGetter())
+             if cred_manager:
+                 self._test_connection(source, cred_manager)
+         logger.info(f"{source.capitalize()} Assessment Configuration Completed")
+
+
+ class ConfigureSqlServerAssessment(AssessmentConfigurator):
+     """SQL Server specific assessment configuration."""
+
+     def _configure_credentials(self) -> str:
+         cred_file = self._credential_file
+         source = "mssql"
+
+         logger.info(
+             "\n(local | env) \nlocal means values are read as plain text \nenv means values are read "
+             "from environment variables, falling back to plain text if the variable is not found\n",
+         )
+         secret_vault_type = str(self.prompts.choice("Enter secret vault type (local | env)", ["local", "env"])).lower()
+         secret_vault_name = None
+
+         logger.info("Please refer to the documentation to understand the difference between local and env.")
+
+         credential = {
+             "secret_vault_type": secret_vault_type,
+             "secret_vault_name": secret_vault_name,
+             source: {
+                 "database": self.prompts.question("Enter the database name"),
+                 "driver": self.prompts.question("Enter the driver details"),
+                 "server": self.prompts.question("Enter the server or host details"),
+                 "port": int(self.prompts.question("Enter the port details", valid_number=True)),
+                 "user": self.prompts.question("Enter the user details"),
+                 "password": self.prompts.question("Enter the password details"),
+             },
+         }
+
+         if cred_file.exists():
+             backup_filename = cred_file.with_suffix('.bak')
+             shutil.copy(cred_file, backup_filename)
+             logger.debug(f"Backup of the existing file created at {backup_filename}")
+
+         with open(cred_file, 'w', encoding='utf-8') as file:
+             yaml.dump(credential, file, default_flow_style=False)
+
+         logger.info(f"Credential template created for {source}.")
+         return source
+
+
+ class ConfigureSynapseAssessment(AssessmentConfigurator):
+     """Synapse specific assessment configuration."""
+
+     def _configure_credentials(self) -> str:
+         cred_file = self._credential_file
+         source = "synapse"
+
+         logger.info(
+             "\n(local | env) \nlocal means values are read as plain text \nenv means values are read "
+             "from environment variables, falling back to plain text if the variable is not found\n",
+         )
+         secret_vault_type = str(self.prompts.choice("Enter secret vault type (local | env)", ["local", "env"])).lower()
+         secret_vault_name = None
+
+         # Synapse Workspace Settings
+         logger.info("Please provide Synapse Workspace settings:")
+         synapse_workspace = {
+             "name": self.prompts.question("Enter Synapse workspace name"),
+             "dedicated_sql_endpoint": self.prompts.question("Enter dedicated SQL endpoint"),
+             "serverless_sql_endpoint": self.prompts.question("Enter serverless SQL endpoint"),
+             "sql_user": self.prompts.question("Enter SQL user"),
+             "sql_password": self.prompts.question("Enter SQL password"),
+             "tz_info": self.prompts.question("Enter timezone (e.g. America/New_York)", default="UTC"),
+         }
+
+         # Azure API Access Settings
+         logger.info("Please provide Azure API access settings:")
+         azure_api_access = {
+             "development_endpoint": self.prompts.question("Enter development endpoint"),
+             "azure_client_id": self.prompts.question("Enter Azure client ID"),
+             "azure_tenant_id": self.prompts.question("Enter Azure tenant ID"),
+             "azure_client_secret": self.prompts.question("Enter Azure client secret"),
+         }
+
+         # JDBC Settings
+         logger.info("Please select JDBC authentication type:")
+         auth_type = self.prompts.choice(
+             "Select authentication type", ["sql_authentication", "ad_passwd_authentication", "spn_authentication"]
+         )
+
+         synapse_jdbc = {
+             "auth_type": auth_type,
+             "fetch_size": self.prompts.question("Enter fetch size", default="1000"),
+             "login_timeout": self.prompts.question("Enter login timeout (seconds)", default="30"),
+         }
+
+         # Profiler Settings
+         logger.info("Please configure profiler settings:")
+         synapse_profiler = {
+             "exclude_serverless_sql_pool": self.prompts.confirm("Exclude serverless SQL pool from profiling?"),
+             "exclude_dedicated_sql_pools": self.prompts.confirm("Exclude dedicated SQL pools from profiling?"),
+             "exclude_spark_pools": self.prompts.confirm("Exclude Spark pools from profiling?"),
+             "exclude_monitoring_metrics": self.prompts.confirm("Exclude monitoring metrics from profiling?"),
+             "redact_sql_pools_sql_text": self.prompts.confirm("Redact SQL pools SQL text?"),
+         }
+
+         credential = {
+             "secret_vault_type": secret_vault_type,
+             "secret_vault_name": secret_vault_name,
+             source: {
+                 "workspace": synapse_workspace,
+                 "azure_api_access": azure_api_access,
+                 "jdbc": synapse_jdbc,
+                 "profiler": synapse_profiler,
+             },
+         }
+
+         if cred_file.exists():
+             backup_filename = cred_file.with_suffix('.bak')
+             shutil.copy(cred_file, backup_filename)
+             logger.debug(f"Backup of the existing file created at {backup_filename}")
+
+         with open(cred_file, 'w', encoding='utf-8') as file:
+             yaml.dump(credential, file, default_flow_style=False)
+
+         logger.info(f"Credential template created for {source}.")
+         return source
+
+
+ def create_assessment_configurator(
+     source_system: str, product_name: str, prompts: Prompts, credential_file=None
+ ) -> AssessmentConfigurator:
+     """Factory function to create the appropriate assessment configurator."""
+     configurators = {
+         "mssql": ConfigureSqlServerAssessment,
+         "synapse": ConfigureSynapseAssessment,
+     }
+
+     if source_system not in configurators:
+         raise ValueError(f"Unsupported source system: {source_system}")
+
+     return configurators[source_system](product_name, prompts, credential_file)
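Usage sketch (illustrative, not part of the wheel contents): a minimal example of driving the factory above from Python, assuming the interactive Prompts object from databricks.labs.blueprint.tui can be constructed directly; "mssql" and "synapse" are the only source systems the factory accepts.

    from databricks.labs.blueprint.tui import Prompts
    from databricks.labs.lakebridge.assessments.configure_assessment import create_assessment_configurator

    # Build the SQL Server configurator and run the interactive flow: it prompts
    # for connection details, writes the credentials YAML (backing up any existing
    # file), and optionally tests the connection to the source system.
    configurator = create_assessment_configurator(
        source_system="mssql",
        product_name="lakebridge",
        prompts=Prompts(),
    )
    configurator.run()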
databricks/labs/lakebridge/assessments/pipeline.py ADDED
@@ -0,0 +1,188 @@
+ from pathlib import Path
+ from subprocess import run, CalledProcessError
+ from dataclasses import dataclass
+ from enum import Enum
+
+ import venv
+ import tempfile
+ import json
+ import logging
+ import yaml
+ import duckdb
+
+ from databricks.labs.lakebridge.connections.credential_manager import cred_file
+
+ from databricks.labs.lakebridge.assessments.profiler_config import PipelineConfig, Step
+ from databricks.labs.lakebridge.connections.database_manager import DatabaseManager
+
+ logger = logging.getLogger(__name__)
+ logger.setLevel("INFO")
+
+ DB_NAME = "profiler_extract.db"
+
+
+ class StepExecutionStatus(str, Enum):
+     COMPLETE = "COMPLETE"
+     ERROR = "ERROR"
+     SKIPPED = "SKIPPED"
+
+
+ @dataclass
+ class StepExecutionResult:
+     step_name: str
+     status: StepExecutionStatus
+     error_message: str | None = None
+
+
+ class PipelineClass:
+     def __init__(self, config: PipelineConfig, executor: DatabaseManager):
+         self.config = config
+         self.executor = executor
+         self.db_path_prefix = Path(config.extract_folder)
+
+     def execute(self) -> list[StepExecutionResult]:
+         logging.info(f"Pipeline initialized with config: {self.config.name}, version: {self.config.version}")
+         execution_results: list[StepExecutionResult] = []
+         for step in self.config.steps:
+             result = self._process_step(step)
+             execution_results.append(result)
+             logging.info(f"Step '{step.name}' completed with status: {result.status}")
+
+         logging.info("Pipeline execution completed")
+         return execution_results
+
+     def _process_step(self, step: Step) -> StepExecutionResult:
+         if step.flag != "active":
+             logging.info(f"Skipping step: {step.name} as it is not active")
+             return StepExecutionResult(step_name=step.name, status=StepExecutionStatus.SKIPPED)
+
+         logging.debug(f"Executing step: {step.name}")
+         try:
+             status = self._execute_step(step)
+             return StepExecutionResult(step_name=step.name, status=status)
+         except RuntimeError as e:
+             return StepExecutionResult(step_name=step.name, status=StepExecutionStatus.ERROR, error_message=str(e))
+
+     def _execute_step(self, step: Step) -> StepExecutionStatus:
+         if step.type == "sql":
+             logging.info(f"Executing SQL step {step.name}")
+             self._execute_sql_step(step)
+             return StepExecutionStatus.COMPLETE
+         if step.type == "python":
+             logging.info(f"Executing Python step {step.name}")
+             self._execute_python_step(step)
+             return StepExecutionStatus.COMPLETE
+         logging.error(f"Unsupported step type: {step.type}")
+         raise RuntimeError(f"Unsupported step type: {step.type}")
+
+     def _execute_sql_step(self, step: Step):
+         logging.debug(f"Reading query from file: {step.extract_source}")
+         with open(step.extract_source, 'r', encoding='utf-8') as file:
+             query = file.read()
+
+         # Execute the query using the database manager
+         logging.info(f"Executing query: {query}")
+         try:
+             result = self.executor.execute_query(query)
+
+             # Save the result to duckdb
+             self._save_to_db(result, step.name, str(step.mode))
+         except Exception as e:
+             logging.error(f"SQL execution failed: {str(e)}")
+             raise RuntimeError(f"SQL execution failed: {str(e)}") from e
+
+     def _execute_python_step(self, step: Step):
+
+         logging.debug(f"Executing Python script: {step.extract_source}")
+         db_path = str(self.db_path_prefix / DB_NAME)
+         credential_config = str(cred_file("lakebridge"))
+
+         # Create a temporary directory for the virtual environment
+         with tempfile.TemporaryDirectory() as temp_dir:
+             venv_dir = Path(temp_dir) / "venv"
+             venv.create(venv_dir, with_pip=True)
+             venv_python = venv_dir / "bin" / "python"
+             venv_pip = venv_dir / "bin" / "pip"
+
+             logger.info(f"Creating a virtual environment for Python script execution: {venv_dir}")
+             # Install dependencies in the virtual environment
+             if step.dependencies:
+                 logging.info(f"Installing dependencies: {', '.join(step.dependencies)}")
+                 try:
+                     logging.debug("Upgrading local pip")
+                     run([str(venv_pip), "install", "--upgrade", "pip"], check=True, capture_output=True, text=True)
+
+                     run([str(venv_pip), "install", *step.dependencies], check=True, capture_output=True, text=True)
+                 except CalledProcessError as e:
+                     logging.error(f"Failed to install dependencies: {e.stderr}")
+                     raise RuntimeError(f"Failed to install dependencies: {e.stderr}") from e
+
+             # Execute the Python script using the virtual environment's Python interpreter
+             try:
+                 result = run(
+                     [
+                         str(venv_python),
+                         str(step.extract_source),
+                         "--db-path",
+                         db_path,
+                         "--credential-config-path",
+                         credential_config,
+                     ],
+                     check=True,
+                     capture_output=True,
+                     text=True,
+                 )
+
+                 try:
+                     output = json.loads(result.stdout)
+                     if output["status"] == "success":
+                         logging.info(f"Python script completed: {output['message']}")
+                     else:
+                         raise RuntimeError(f"Script reported error: {output['message']}")
+                 except json.JSONDecodeError:
+                     logging.info(f"Python script output: {result.stdout}")
+
+             except CalledProcessError as e:
+                 error_msg = e.stderr
+                 logging.error(f"Python script failed: {error_msg}")
+                 raise RuntimeError(f"Script execution failed: {error_msg}") from e
+
+     def _save_to_db(self, result, step_name: str, mode: str, batch_size: int = 1000):
+         self._create_dir(self.db_path_prefix)
+         db_path = str(self.db_path_prefix / DB_NAME)
+
+         with duckdb.connect(db_path) as conn:
+             columns = result.keys()
+             # TODO: Add support for figuring out data types from the SQLAlchemy result object; result.cursor.description is not reliable
+             schema = ' STRING, '.join(columns) + ' STRING'
+
+             # Handle write modes
+             if mode == 'overwrite':
+                 conn.execute(f"CREATE OR REPLACE TABLE {step_name} ({schema})")
+             elif mode == 'append' and step_name not in conn.get_table_names(""):
+                 conn.execute(f"CREATE TABLE {step_name} ({schema})")
+
+             # Batch insert using prepared statements
+             placeholders = ', '.join(['?' for _ in columns])
+             insert_query = f"INSERT INTO {step_name} VALUES ({placeholders})"
+
+             # Fetch and insert rows in batches
+             while True:
+                 rows = result.fetchmany(batch_size)
+                 if not rows:
+                     break
+                 conn.executemany(insert_query, rows)
+
+     @staticmethod
+     def _create_dir(dir_path: Path):
+         if not Path(dir_path).exists():
+             dir_path.mkdir(parents=True, exist_ok=True)
+
+     @staticmethod
+     def load_config_from_yaml(file_path: str) -> PipelineConfig:
+         with open(file_path, 'r', encoding='utf-8') as file:
+             data = yaml.safe_load(file)
+         steps = [Step(**step) for step in data['steps']]
+         return PipelineConfig(
+             name=data['name'], version=data['version'], extract_folder=data['extract_folder'], steps=steps
+         )
databricks/labs/lakebridge/assessments/profiler_config.py ADDED
@@ -0,0 +1,30 @@
+ from dataclasses import dataclass, field
+
+
+ @dataclass
+ class Step:
+     name: str
+     type: str | None
+     extract_source: str
+     mode: str | None
+     frequency: str | None
+     flag: str | None
+     dependencies: list[str] = field(default_factory=list)
+     comment: str | None = None
+
+     def __post_init__(self):
+         if self.frequency is None:
+             self.frequency = "once"
+         if self.flag is None:
+             self.flag = "active"
+         if self.mode is None:
+             self.mode = "append"
+
+
+ @dataclass
+ class PipelineConfig:
+     name: str
+     version: str
+     extract_folder: str
+     comment: str | None = None
+     steps: list[Step] = field(default_factory=list)
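Usage sketch (illustrative, not part of the wheel contents): constructing the dataclasses above directly, showing how __post_init__ fills in defaults when the optional fields are passed as None; the names and paths below are hypothetical.

    from databricks.labs.lakebridge.assessments.profiler_config import PipelineConfig, Step

    step = Step(
        name="extract_tables",              # hypothetical step name
        type="sql",
        extract_source="queries/tables.sql",  # hypothetical query file
        mode=None,                          # becomes "append"
        frequency=None,                     # becomes "once"
        flag=None,                          # becomes "active"
    )
    config = PipelineConfig(name="mssql_profiler", version="1.0", extract_folder="/tmp/extracts", steps=[step])
    assert step.mode == "append" and step.flag == "active" and step.frequency == "once"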
databricks/labs/lakebridge/base_install.py ADDED
@@ -0,0 +1,12 @@
+ from databricks.labs.blueprint.logger import install_logger
+ from databricks.labs.blueprint.entrypoint import get_logger
+ from databricks.sdk.core import with_user_agent_extra
+
+ install_logger()
+ with_user_agent_extra("cmd", "install")
+
+ if __name__ == "__main__":
+     logger = get_logger(__file__)
+     logger.setLevel("INFO")
+
+     logger.info("Successfully Setup Remorph Components Locally")