databricks-labs-lakebridge 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- databricks/__init__.py +3 -0
- databricks/labs/__init__.py +3 -0
- databricks/labs/lakebridge/__about__.py +2 -0
- databricks/labs/lakebridge/__init__.py +11 -0
- databricks/labs/lakebridge/assessments/configure_assessment.py +194 -0
- databricks/labs/lakebridge/assessments/pipeline.py +188 -0
- databricks/labs/lakebridge/assessments/profiler_config.py +30 -0
- databricks/labs/lakebridge/base_install.py +12 -0
- databricks/labs/lakebridge/cli.py +449 -0
- databricks/labs/lakebridge/config.py +192 -0
- databricks/labs/lakebridge/connections/__init__.py +0 -0
- databricks/labs/lakebridge/connections/credential_manager.py +89 -0
- databricks/labs/lakebridge/connections/database_manager.py +98 -0
- databricks/labs/lakebridge/connections/env_getter.py +13 -0
- databricks/labs/lakebridge/contexts/__init__.py +0 -0
- databricks/labs/lakebridge/contexts/application.py +133 -0
- databricks/labs/lakebridge/coverage/__init__.py +0 -0
- databricks/labs/lakebridge/coverage/commons.py +223 -0
- databricks/labs/lakebridge/coverage/lakebridge_snow_transpilation_coverage.py +29 -0
- databricks/labs/lakebridge/coverage/local_report.py +9 -0
- databricks/labs/lakebridge/coverage/sqlglot_snow_transpilation_coverage.py +5 -0
- databricks/labs/lakebridge/coverage/sqlglot_tsql_transpilation_coverage.py +5 -0
- databricks/labs/lakebridge/deployment/__init__.py +0 -0
- databricks/labs/lakebridge/deployment/configurator.py +199 -0
- databricks/labs/lakebridge/deployment/dashboard.py +140 -0
- databricks/labs/lakebridge/deployment/installation.py +125 -0
- databricks/labs/lakebridge/deployment/job.py +147 -0
- databricks/labs/lakebridge/deployment/recon.py +145 -0
- databricks/labs/lakebridge/deployment/table.py +30 -0
- databricks/labs/lakebridge/deployment/upgrade_common.py +124 -0
- databricks/labs/lakebridge/discovery/table.py +36 -0
- databricks/labs/lakebridge/discovery/table_definition.py +23 -0
- databricks/labs/lakebridge/discovery/tsql_table_definition.py +185 -0
- databricks/labs/lakebridge/errors/exceptions.py +1 -0
- databricks/labs/lakebridge/helpers/__init__.py +0 -0
- databricks/labs/lakebridge/helpers/db_sql.py +24 -0
- databricks/labs/lakebridge/helpers/execution_time.py +20 -0
- databricks/labs/lakebridge/helpers/file_utils.py +64 -0
- databricks/labs/lakebridge/helpers/metastore.py +164 -0
- databricks/labs/lakebridge/helpers/recon_config_utils.py +176 -0
- databricks/labs/lakebridge/helpers/string_utils.py +62 -0
- databricks/labs/lakebridge/helpers/telemetry_utils.py +13 -0
- databricks/labs/lakebridge/helpers/validation.py +101 -0
- databricks/labs/lakebridge/install.py +849 -0
- databricks/labs/lakebridge/intermediate/__init__.py +0 -0
- databricks/labs/lakebridge/intermediate/dag.py +88 -0
- databricks/labs/lakebridge/intermediate/engine_adapter.py +0 -0
- databricks/labs/lakebridge/intermediate/root_tables.py +44 -0
- databricks/labs/lakebridge/jvmproxy.py +56 -0
- databricks/labs/lakebridge/lineage.py +42 -0
- databricks/labs/lakebridge/reconcile/__init__.py +0 -0
- databricks/labs/lakebridge/reconcile/compare.py +414 -0
- databricks/labs/lakebridge/reconcile/connectors/__init__.py +0 -0
- databricks/labs/lakebridge/reconcile/connectors/data_source.py +72 -0
- databricks/labs/lakebridge/reconcile/connectors/databricks.py +87 -0
- databricks/labs/lakebridge/reconcile/connectors/jdbc_reader.py +41 -0
- databricks/labs/lakebridge/reconcile/connectors/oracle.py +108 -0
- databricks/labs/lakebridge/reconcile/connectors/secrets.py +30 -0
- databricks/labs/lakebridge/reconcile/connectors/snowflake.py +173 -0
- databricks/labs/lakebridge/reconcile/connectors/source_adapter.py +30 -0
- databricks/labs/lakebridge/reconcile/connectors/sql_server.py +132 -0
- databricks/labs/lakebridge/reconcile/constants.py +37 -0
- databricks/labs/lakebridge/reconcile/exception.py +42 -0
- databricks/labs/lakebridge/reconcile/execute.py +920 -0
- databricks/labs/lakebridge/reconcile/query_builder/__init__.py +0 -0
- databricks/labs/lakebridge/reconcile/query_builder/aggregate_query.py +293 -0
- databricks/labs/lakebridge/reconcile/query_builder/base.py +138 -0
- databricks/labs/lakebridge/reconcile/query_builder/count_query.py +33 -0
- databricks/labs/lakebridge/reconcile/query_builder/expression_generator.py +292 -0
- databricks/labs/lakebridge/reconcile/query_builder/hash_query.py +91 -0
- databricks/labs/lakebridge/reconcile/query_builder/sampling_query.py +123 -0
- databricks/labs/lakebridge/reconcile/query_builder/threshold_query.py +231 -0
- databricks/labs/lakebridge/reconcile/recon_capture.py +635 -0
- databricks/labs/lakebridge/reconcile/recon_config.py +363 -0
- databricks/labs/lakebridge/reconcile/recon_output_config.py +85 -0
- databricks/labs/lakebridge/reconcile/runner.py +97 -0
- databricks/labs/lakebridge/reconcile/sampler.py +239 -0
- databricks/labs/lakebridge/reconcile/schema_compare.py +126 -0
- databricks/labs/lakebridge/resources/__init__.py +0 -0
- databricks/labs/lakebridge/resources/config/credentials.yml +33 -0
- databricks/labs/lakebridge/resources/reconcile/__init__.py +0 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/__init__.py +0 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/00_0_aggregate_recon_header.md +6 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_0_recon_id.filter.yml +6 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_1_executed_by.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_2_started_at.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_0_source_type.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_1_source_table.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_2_target_table.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/04_0_aggregate_summary_table.sql +46 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/05_0_aggregate_recon_drilldown_header.md +2 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_0_recon_id.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_1_category.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_2_aggregate_type.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/07_0_target_table.filter.yml +4 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/07_1_source_table.filter.yml +4 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/08_0_aggregate_details_table.sql +92 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/09_0_aggregate_missing_mismatch_header.md +1 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/10_0_aggr_mismatched_records.sql +19 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/11_0_aggr_missing_in_databricks.sql +19 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/11_1_aggr_missing_in_source.sql +19 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/dashboard.yml +365 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/00_0_recon_main.md +3 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/01_0_recon_id.filter.yml +6 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/01_1_report_type.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/01_2_executed_by.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/02_0_source_type.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/02_1_source_table.filter.yml +6 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/02_2_target_table.filter.yml +6 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/03_0_started_at.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/05_0_summary_table.sql +38 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/06_0_schema_comparison_header.md +3 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/07_0_schema_details_table.sql +42 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/08_0_drill_down_header.md +3 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/09_0_recon_id.filter.yml +4 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/09_1_category.filter.yml +4 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/10_0_target_table.filter.yml +4 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/10_1_source_table.filter.yml +4 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/11_0_recon_details_pivot.sql +40 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/12_0_daily_data_validation_issue_header.md +3 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/13_0_success_fail_.filter.yml +4 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/14_0_failed_recon_ids.sql +15 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/15_0_total_failed_runs.sql +10 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/15_1_failed_targets.sql +10 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/15_2_successful_targets.sql +10 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/16_0_missing_mismatch_header.md +1 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/17_0_mismatched_records.sql +14 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/17_1_threshold_mismatches.sql +14 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/18_0_missing_in_databricks.sql +14 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/18_1_missing_in_source.sql +14 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/dashboard.yml +545 -0
- databricks/labs/lakebridge/resources/reconcile/queries/__init__.py +0 -0
- databricks/labs/lakebridge/resources/reconcile/queries/installation/__init__.py +0 -0
- databricks/labs/lakebridge/resources/reconcile/queries/installation/aggregate_details.sql +7 -0
- databricks/labs/lakebridge/resources/reconcile/queries/installation/aggregate_metrics.sql +15 -0
- databricks/labs/lakebridge/resources/reconcile/queries/installation/aggregate_rules.sql +6 -0
- databricks/labs/lakebridge/resources/reconcile/queries/installation/details.sql +7 -0
- databricks/labs/lakebridge/resources/reconcile/queries/installation/main.sql +24 -0
- databricks/labs/lakebridge/resources/reconcile/queries/installation/metrics.sql +21 -0
- databricks/labs/lakebridge/transpiler/__init__.py +0 -0
- databricks/labs/lakebridge/transpiler/execute.py +423 -0
- databricks/labs/lakebridge/transpiler/lsp/__init__.py +0 -0
- databricks/labs/lakebridge/transpiler/lsp/lsp_engine.py +564 -0
- databricks/labs/lakebridge/transpiler/sqlglot/__init__.py +0 -0
- databricks/labs/lakebridge/transpiler/sqlglot/dialect_utils.py +30 -0
- databricks/labs/lakebridge/transpiler/sqlglot/generator/__init__.py +0 -0
- databricks/labs/lakebridge/transpiler/sqlglot/generator/databricks.py +771 -0
- databricks/labs/lakebridge/transpiler/sqlglot/lca_utils.py +138 -0
- databricks/labs/lakebridge/transpiler/sqlglot/local_expression.py +197 -0
- databricks/labs/lakebridge/transpiler/sqlglot/parsers/__init__.py +0 -0
- databricks/labs/lakebridge/transpiler/sqlglot/parsers/oracle.py +23 -0
- databricks/labs/lakebridge/transpiler/sqlglot/parsers/presto.py +202 -0
- databricks/labs/lakebridge/transpiler/sqlglot/parsers/snowflake.py +535 -0
- databricks/labs/lakebridge/transpiler/sqlglot/sqlglot_engine.py +203 -0
- databricks/labs/lakebridge/transpiler/transpile_engine.py +49 -0
- databricks/labs/lakebridge/transpiler/transpile_status.py +68 -0
- databricks/labs/lakebridge/uninstall.py +28 -0
- databricks/labs/lakebridge/upgrades/v0.4.0_add_main_table_operation_name_column.py +80 -0
- databricks/labs/lakebridge/upgrades/v0.6.0_alter_metrics_datatype.py +51 -0
- databricks_labs_lakebridge-0.10.0.dist-info/METADATA +58 -0
- databricks_labs_lakebridge-0.10.0.dist-info/RECORD +171 -0
- databricks_labs_lakebridge-0.10.0.dist-info/WHEEL +4 -0
- databricks_labs_lakebridge-0.10.0.dist-info/entry_points.txt +2 -0
- databricks_labs_lakebridge-0.10.0.dist-info/licenses/LICENSE +69 -0
- databricks_labs_lakebridge-0.10.0.dist-info/licenses/NOTICE +42 -0
- docs/lakebridge/src/components/Button.tsx +81 -0
- docs/lakebridge/src/css/custom.css +167 -0
- docs/lakebridge/src/css/table.css +20 -0
- docs/lakebridge/src/pages/index.tsx +57 -0
- docs/lakebridge/src/theme/Footer/index.tsx +24 -0
- docs/lakebridge/src/theme/Layout/index.tsx +18 -0
databricks/labs/lakebridge/reconcile/connectors/data_source.py
@@ -0,0 +1,72 @@
+import logging
+from abc import ABC, abstractmethod
+
+from pyspark.sql import DataFrame
+
+from databricks.labs.lakebridge.reconcile.exception import DataSourceRuntimeException
+from databricks.labs.lakebridge.reconcile.recon_config import JdbcReaderOptions, Schema
+
+logger = logging.getLogger(__name__)
+
+
+class DataSource(ABC):
+
+    @abstractmethod
+    def read_data(
+        self,
+        catalog: str | None,
+        schema: str,
+        table: str,
+        query: str,
+        options: JdbcReaderOptions | None,
+    ) -> DataFrame:
+        return NotImplemented
+
+    @abstractmethod
+    def get_schema(
+        self,
+        catalog: str | None,
+        schema: str,
+        table: str,
+    ) -> list[Schema]:
+        return NotImplemented
+
+    @classmethod
+    def log_and_throw_exception(cls, exception: Exception, fetch_type: str, query: str):
+        error_msg = f"Runtime exception occurred while fetching {fetch_type} using {query} : {exception}"
+        logger.warning(error_msg)
+        raise DataSourceRuntimeException(error_msg) from exception
+
+
+class MockDataSource(DataSource):
+
+    def __init__(
+        self,
+        dataframe_repository: dict[tuple[str, str, str], DataFrame],
+        schema_repository: dict[tuple[str, str, str], list[Schema]],
+        exception: Exception = RuntimeError("Mock Exception"),
+    ):
+        self._dataframe_repository: dict[tuple[str, str, str], DataFrame] = dataframe_repository
+        self._schema_repository: dict[tuple[str, str, str], list[Schema]] = schema_repository
+        self._exception = exception
+
+    def read_data(
+        self,
+        catalog: str | None,
+        schema: str,
+        table: str,
+        query: str,
+        options: JdbcReaderOptions | None,
+    ) -> DataFrame:
+        catalog_str = catalog if catalog else ""
+        mock_df = self._dataframe_repository.get((catalog_str, schema, query))
+        if not mock_df:
+            return self.log_and_throw_exception(self._exception, "data", f"({catalog}, {schema}, {query})")
+        return mock_df
+
+    def get_schema(self, catalog: str | None, schema: str, table: str) -> list[Schema]:
+        catalog_str = catalog if catalog else ""
+        mock_schema = self._schema_repository.get((catalog_str, schema, table))
+        if not mock_schema:
+            return self.log_and_throw_exception(self._exception, "schema", f"({catalog}, {schema}, {table})")
+        return mock_schema
databricks/labs/lakebridge/reconcile/connectors/databricks.py
@@ -0,0 +1,87 @@
+import logging
+import re
+from datetime import datetime
+
+from pyspark.errors import PySparkException
+from pyspark.sql import DataFrame, SparkSession
+from pyspark.sql.functions import col
+from sqlglot import Dialect
+
+from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource
+from databricks.labs.lakebridge.reconcile.connectors.secrets import SecretsMixin
+from databricks.labs.lakebridge.reconcile.recon_config import JdbcReaderOptions, Schema
+from databricks.sdk import WorkspaceClient
+
+logger = logging.getLogger(__name__)
+
+
+def _get_schema_query(catalog: str, schema: str, table: str):
+    # TODO: Ensure that the target_catalog in the configuration is not set to "hive_metastore". The source_catalog
+    #  can only be set to "hive_metastore" if the source type is "databricks".
+    if schema == "global_temp":
+        return f"describe table global_temp.{table}"
+    if catalog == "hive_metastore":
+        return f"describe table {catalog}.{schema}.{table}"
+
+    query = f"""select
+                lower(column_name) as col_name,
+                full_data_type as data_type
+                from {catalog}.information_schema.columns
+                where lower(table_catalog)='{catalog}'
+                and lower(table_schema)='{schema}'
+                and lower(table_name) ='{table}'
+                order by col_name"""
+    return re.sub(r'\s+', ' ', query)
+
+
+class DatabricksDataSource(DataSource, SecretsMixin):
+
+    def __init__(
+        self,
+        engine: Dialect,
+        spark: SparkSession,
+        ws: WorkspaceClient,
+        secret_scope: str,
+    ):
+        self._engine = engine
+        self._spark = spark
+        self._ws = ws
+        self._secret_scope = secret_scope
+
+    def read_data(
+        self,
+        catalog: str | None,
+        schema: str,
+        table: str,
+        query: str,
+        options: JdbcReaderOptions | None,
+    ) -> DataFrame:
+        namespace_catalog = "hive_metastore" if not catalog else catalog
+        if schema == "global_temp":
+            namespace_catalog = "global_temp"
+        else:
+            namespace_catalog = f"{namespace_catalog}.{schema}"
+        table_with_namespace = f"{namespace_catalog}.{table}"
+        table_query = query.replace(":tbl", table_with_namespace)
+        try:
+            df = self._spark.sql(table_query)
+            return df.select([col(column).alias(column.lower()) for column in df.columns])
+        except (RuntimeError, PySparkException) as e:
+            return self.log_and_throw_exception(e, "data", table_query)
+
+    def get_schema(
+        self,
+        catalog: str | None,
+        schema: str,
+        table: str,
+    ) -> list[Schema]:
+        catalog_str = catalog if catalog else "hive_metastore"
+        schema_query = _get_schema_query(catalog_str, schema, table)
+        try:
+            logger.debug(f"Fetching schema using query: \n`{schema_query}`")
+            logger.info(f"Fetching Schema: Started at: {datetime.now()}")
+            schema_metadata = self._spark.sql(schema_query).where("col_name not like '#%'").distinct().collect()
+            logger.info(f"Schema fetched successfully. Completed at: {datetime.now()}")
+            return [Schema(field.col_name.lower(), field.data_type.lower()) for field in schema_metadata]
+        except (RuntimeError, PySparkException) as e:
+            return self.log_and_throw_exception(e, "schema", schema_query)
databricks/labs/lakebridge/reconcile/connectors/jdbc_reader.py
@@ -0,0 +1,41 @@
+from typing import Any
+
+from pyspark.sql import SparkSession
+
+from databricks.labs.lakebridge.reconcile.recon_config import JdbcReaderOptions
+
+
+class JDBCReaderMixin:
+    _spark: SparkSession
+
+    # TODO update the url
+    def _get_jdbc_reader(self, query, jdbc_url, driver, prepare_query=None):
+        driver_class = {
+            "oracle": "oracle.jdbc.driver.OracleDriver",
+            "snowflake": "net.snowflake.client.jdbc.SnowflakeDriver",
+            "sqlserver": "com.microsoft.sqlserver.jdbc.SQLServerDriver",
+        }
+        reader = (
+            self._spark.read.format("jdbc")
+            .option("url", jdbc_url)
+            .option("driver", driver_class.get(driver, driver))
+            .option("dbtable", f"({query}) tmp")
+        )
+        if prepare_query is not None:
+            reader = reader.option('prepareQuery', prepare_query)
+        return reader
+
+    @staticmethod
+    def _get_jdbc_reader_options(options: JdbcReaderOptions):
+        option_dict: dict[str, Any] = {}
+        if options.number_partitions:
+            option_dict["numPartitions"] = options.number_partitions
+        if options.partition_column:
+            option_dict["partitionColumn"] = options.partition_column
+        if options.lower_bound:
+            option_dict["lowerBound"] = options.lower_bound
+        if options.upper_bound:
+            option_dict["upperBound"] = options.upper_bound
+        if options.fetch_size:
+            option_dict["fetchsize"] = options.fetch_size
+        return option_dict
databricks/labs/lakebridge/reconcile/connectors/oracle.py
@@ -0,0 +1,108 @@
+import re
+import logging
+from datetime import datetime
+
+from pyspark.errors import PySparkException
+from pyspark.sql import DataFrame, DataFrameReader, SparkSession
+from pyspark.sql.functions import col
+from sqlglot import Dialect
+
+from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource
+from databricks.labs.lakebridge.reconcile.connectors.jdbc_reader import JDBCReaderMixin
+from databricks.labs.lakebridge.reconcile.connectors.secrets import SecretsMixin
+from databricks.labs.lakebridge.reconcile.recon_config import JdbcReaderOptions, Schema
+from databricks.sdk import WorkspaceClient
+
+logger = logging.getLogger(__name__)
+
+
+class OracleDataSource(DataSource, SecretsMixin, JDBCReaderMixin):
+    _DRIVER = "oracle"
+    _SCHEMA_QUERY = """select column_name, case when (data_precision is not null
+                                                      and data_scale <> 0)
+                                                      then data_type || '(' || data_precision || ',' || data_scale || ')'
+                                                      when (data_precision is not null and data_scale = 0)
+                                                      then data_type || '(' || data_precision || ')'
+                                                      when data_precision is null and (lower(data_type) in ('date') or
+                                                      lower(data_type) like 'timestamp%') then data_type
+                                                      when CHAR_LENGTH = 0 then data_type
+                                                      else data_type || '(' || CHAR_LENGTH || ')'
+                                                      end data_type
+                                                      FROM ALL_TAB_COLUMNS
+                                                      WHERE lower(TABLE_NAME) = '{table}' and lower(owner) = '{owner}'"""
+
+    def __init__(
+        self,
+        engine: Dialect,
+        spark: SparkSession,
+        ws: WorkspaceClient,
+        secret_scope: str,
+    ):
+        self._engine = engine
+        self._spark = spark
+        self._ws = ws
+        self._secret_scope = secret_scope
+
+    @property
+    def get_jdbc_url(self) -> str:
+        return (
+            f"jdbc:{OracleDataSource._DRIVER}:thin:{self._get_secret('user')}"
+            f"/{self._get_secret('password')}@//{self._get_secret('host')}"
+            f":{self._get_secret('port')}/{self._get_secret('database')}"
+        )
+
+    def read_data(
+        self,
+        catalog: str | None,
+        schema: str,
+        table: str,
+        query: str,
+        options: JdbcReaderOptions | None,
+    ) -> DataFrame:
+        table_query = query.replace(":tbl", f"{schema}.{table}")
+        try:
+            if options is None:
+                return self.reader(table_query).options(**self._get_timestamp_options()).load()
+            reader_options = self._get_jdbc_reader_options(options) | self._get_timestamp_options()
+            df = self.reader(table_query).options(**reader_options).load()
+            logger.warning(f"Fetching data using query: \n`{table_query}`")
+
+            # Convert all column names to lower case
+            df = df.select([col(c).alias(c.lower()) for c in df.columns])
+            return df
+        except (RuntimeError, PySparkException) as e:
+            return self.log_and_throw_exception(e, "data", table_query)
+
+    def get_schema(
+        self,
+        catalog: str | None,
+        schema: str,
+        table: str,
+    ) -> list[Schema]:
+        schema_query = re.sub(
+            r'\s+',
+            ' ',
+            OracleDataSource._SCHEMA_QUERY.format(table=table, owner=schema),
+        )
+        try:
+            logger.debug(f"Fetching schema using query: \n`{schema_query}`")
+            logger.info(f"Fetching Schema: Started at: {datetime.now()}")
+            df = self.reader(schema_query).load()
+            schema_metadata = df.select([col(c).alias(c.lower()) for c in df.columns]).collect()
+            logger.info(f"Schema fetched successfully. Completed at: {datetime.now()}")
+            logger.debug(f"schema_metadata: ${schema_metadata}")
+            return [Schema(field.column_name.lower(), field.data_type.lower()) for field in schema_metadata]
+        except (RuntimeError, PySparkException) as e:
+            return self.log_and_throw_exception(e, "schema", schema_query)
+
+    @staticmethod
+    def _get_timestamp_options() -> dict[str, str]:
+        return {
+            "oracle.jdbc.mapDateToTimestamp": "False",
+            "sessionInitStatement": "BEGIN dbms_session.set_nls('nls_date_format', "
+            "'''YYYY-MM-DD''');dbms_session.set_nls('nls_timestamp_format', '''YYYY-MM-DD "
+            "HH24:MI:SS''');END;",
+        }
+
+    def reader(self, query: str) -> DataFrameReader:
+        return self._get_jdbc_reader(query, self.get_jdbc_url, OracleDataSource._DRIVER)
databricks/labs/lakebridge/reconcile/connectors/secrets.py
@@ -0,0 +1,30 @@
+import base64
+import logging
+
+from databricks.sdk import WorkspaceClient
+from databricks.sdk.errors import NotFound
+
+logger = logging.getLogger(__name__)
+
+
+class SecretsMixin:
+    _ws: WorkspaceClient
+    _secret_scope: str
+
+    def _get_secret(self, secret_key: str) -> str:
+        """Get the secret value given a secret scope & secret key. Log a warning if secret does not exist"""
+        try:
+            # Return the decoded secret value in string format
+            secret = self._ws.secrets.get_secret(self._secret_scope, secret_key)
+            assert secret.value is not None
+            return base64.b64decode(secret.value).decode("utf-8")
+        except NotFound as e:
+            raise NotFound(f'Secret does not exist with scope: {self._secret_scope} and key: {secret_key} : {e}') from e
+        except UnicodeDecodeError as e:
+            raise UnicodeDecodeError(
+                "utf-8",
+                secret_key.encode(),
+                0,
+                1,
+                f"Secret {self._secret_scope}/{secret_key} has Base64 bytes that cannot be decoded to utf-8 string: {e}.",
+            ) from e
databricks/labs/lakebridge/reconcile/connectors/snowflake.py
@@ -0,0 +1,173 @@
+import logging
+import re
+from datetime import datetime
+
+from pyspark.errors import PySparkException
+from pyspark.sql import DataFrame, DataFrameReader, SparkSession
+from pyspark.sql.functions import col
+from sqlglot import Dialect
+from cryptography.hazmat.backends import default_backend
+from cryptography.hazmat.primitives import serialization
+
+from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource
+from databricks.labs.lakebridge.reconcile.connectors.jdbc_reader import JDBCReaderMixin
+from databricks.labs.lakebridge.reconcile.connectors.secrets import SecretsMixin
+from databricks.labs.lakebridge.reconcile.exception import InvalidSnowflakePemPrivateKey
+from databricks.labs.lakebridge.reconcile.recon_config import JdbcReaderOptions, Schema
+from databricks.sdk import WorkspaceClient
+from databricks.sdk.errors import NotFound
+
+logger = logging.getLogger(__name__)
+
+
+class SnowflakeDataSource(DataSource, SecretsMixin, JDBCReaderMixin):
+    _DRIVER = "snowflake"
+    """
+    * INFORMATION_SCHEMA:
+      - see https://docs.snowflake.com/en/sql-reference/info-schema#considerations-for-replacing-show-commands-with-information-schema-views
+    * DATA:
+      - only unquoted identifiers are treated as case-insensitive and are stored in uppercase.
+      - for quoted identifiers refer:
+        https://docs.snowflake.com/en/sql-reference/identifiers-syntax#double-quoted-identifiers
+    * ORDINAL_POSITION:
+      - indicates the sequential order of a column within a table or view,
+        starting from 1 based on the order of column definition.
+    """
+    _SCHEMA_QUERY = """select column_name,
+                       case
+                           when numeric_precision is not null and numeric_scale is not null
+                           then
+                               concat(data_type, '(', numeric_precision, ',' , numeric_scale, ')')
+                           when lower(data_type) = 'text'
+                           then
+                               concat('varchar', '(', CHARACTER_MAXIMUM_LENGTH, ')')
+                           else data_type
+                       end as data_type
+                       from {catalog}.INFORMATION_SCHEMA.COLUMNS
+                       where lower(table_name)='{table}' and table_schema = '{schema}'
+                       order by ordinal_position"""
+
+    def __init__(
+        self,
+        engine: Dialect,
+        spark: SparkSession,
+        ws: WorkspaceClient,
+        secret_scope: str,
+    ):
+        self._engine = engine
+        self._spark = spark
+        self._ws = ws
+        self._secret_scope = secret_scope
+
+    @property
+    def get_jdbc_url(self) -> str:
+        try:
+            sf_password = self._get_secret('sfPassword')
+        except (NotFound, KeyError) as e:
+            message = "sfPassword is mandatory for jdbc connectivity with Snowflake."
+            logger.error(message)
+            raise NotFound(message) from e
+
+        return (
+            f"jdbc:{SnowflakeDataSource._DRIVER}://{self._get_secret('sfAccount')}.snowflakecomputing.com"
+            f"/?user={self._get_secret('sfUser')}&password={sf_password}"
+            f"&db={self._get_secret('sfDatabase')}&schema={self._get_secret('sfSchema')}"
+            f"&warehouse={self._get_secret('sfWarehouse')}&role={self._get_secret('sfRole')}"
+        )
+
+    @staticmethod
+    def get_private_key(pem_private_key: str) -> str:
+        try:
+            private_key_bytes = pem_private_key.encode("UTF-8")
+            p_key = serialization.load_pem_private_key(
+                private_key_bytes,
+                password=None,
+                backend=default_backend(),
+            )
+            pkb = p_key.private_bytes(
+                encoding=serialization.Encoding.PEM,
+                format=serialization.PrivateFormat.PKCS8,
+                encryption_algorithm=serialization.NoEncryption(),
+            )
+            pkb_str = pkb.decode("UTF-8")
+            # Remove the first and last lines (BEGIN/END markers)
+            private_key_pem_lines = pkb_str.strip().split('\n')[1:-1]
+            # Join the lines to form the base64 encoded string
+            private_key_pem_str = ''.join(private_key_pem_lines)
+            return private_key_pem_str
+        except Exception as e:
+            message = f"Failed to load or process the provided PEM private key. --> {e}"
+            logger.error(message)
+            raise InvalidSnowflakePemPrivateKey(message) from e
+
+    def read_data(
+        self,
+        catalog: str | None,
+        schema: str,
+        table: str,
+        query: str,
+        options: JdbcReaderOptions | None,
+    ) -> DataFrame:
+        table_query = query.replace(":tbl", f"{catalog}.{schema}.{table}")
+        try:
+            if options is None:
+                df = self.reader(table_query).load()
+            else:
+                options = self._get_jdbc_reader_options(options)
+                df = (
+                    self._get_jdbc_reader(table_query, self.get_jdbc_url, SnowflakeDataSource._DRIVER)
+                    .options(**options)
+                    .load()
+                )
+            return df.select([col(column).alias(column.lower()) for column in df.columns])
+        except (RuntimeError, PySparkException) as e:
+            return self.log_and_throw_exception(e, "data", table_query)
+
+    def get_schema(
+        self,
+        catalog: str | None,
+        schema: str,
+        table: str,
+    ) -> list[Schema]:
+        """
+        Fetch the Schema from the INFORMATION_SCHEMA.COLUMNS table in Snowflake.
+
+        If the user's current role does not have the necessary privileges to access the specified
+        Information Schema object, RunTimeError will be raised:
+        "SQL access control error: Insufficient privileges to operate on schema 'INFORMATION_SCHEMA' "
+        """
+        schema_query = re.sub(
+            r'\s+',
+            ' ',
+            SnowflakeDataSource._SCHEMA_QUERY.format(catalog=catalog, schema=schema.upper(), table=table),
+        )
+        try:
+            logger.debug(f"Fetching schema using query: \n`{schema_query}`")
+            logger.info(f"Fetching Schema: Started at: {datetime.now()}")
+            schema_metadata = self.reader(schema_query).load().collect()
+            logger.info(f"Schema fetched successfully. Completed at: {datetime.now()}")
+            return [Schema(field.COLUMN_NAME.lower(), field.DATA_TYPE.lower()) for field in schema_metadata]
+        except (RuntimeError, PySparkException) as e:
+            return self.log_and_throw_exception(e, "schema", schema_query)
+
+    def reader(self, query: str) -> DataFrameReader:
+        options = {
+            "sfUrl": self._get_secret('sfUrl'),
+            "sfUser": self._get_secret('sfUser'),
+            "sfDatabase": self._get_secret('sfDatabase'),
+            "sfSchema": self._get_secret('sfSchema'),
+            "sfWarehouse": self._get_secret('sfWarehouse'),
+            "sfRole": self._get_secret('sfRole'),
+        }
+        try:
+            options["pem_private_key"] = SnowflakeDataSource.get_private_key(self._get_secret('pem_private_key'))
+        except (NotFound, KeyError):
+            logger.warning("pem_private_key not found. Checking for sfPassword")
+            try:
+                options["sfPassword"] = self._get_secret('sfPassword')
+            except (NotFound, KeyError) as e:
+                message = "sfPassword and pem_private_key not found. Either one is required for snowflake auth."
+                logger.error(message)
+                raise NotFound(message) from e
+
+        return self._spark.read.format("snowflake").option("dbtable", f"({query}) as tmp").options(**options)
databricks/labs/lakebridge/reconcile/connectors/source_adapter.py
@@ -0,0 +1,30 @@
+from pyspark.sql import SparkSession
+from sqlglot import Dialect
+from sqlglot.dialects import TSQL
+
+from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource
+from databricks.labs.lakebridge.reconcile.connectors.databricks import DatabricksDataSource
+from databricks.labs.lakebridge.reconcile.connectors.oracle import OracleDataSource
+from databricks.labs.lakebridge.reconcile.connectors.snowflake import SnowflakeDataSource
+from databricks.labs.lakebridge.reconcile.connectors.sql_server import SQLServerDataSource
+from databricks.labs.lakebridge.transpiler.sqlglot.generator.databricks import Databricks
+from databricks.labs.lakebridge.transpiler.sqlglot.parsers.oracle import Oracle
+from databricks.labs.lakebridge.transpiler.sqlglot.parsers.snowflake import Snowflake
+from databricks.sdk import WorkspaceClient
+
+
+def create_adapter(
+    engine: Dialect,
+    spark: SparkSession,
+    ws: WorkspaceClient,
+    secret_scope: str,
+) -> DataSource:
+    if isinstance(engine, Snowflake):
+        return SnowflakeDataSource(engine, spark, ws, secret_scope)
+    if isinstance(engine, Oracle):
+        return OracleDataSource(engine, spark, ws, secret_scope)
+    if isinstance(engine, Databricks):
+        return DatabricksDataSource(engine, spark, ws, secret_scope)
+    if isinstance(engine, TSQL):
+        return SQLServerDataSource(engine, spark, ws, secret_scope)
+    raise ValueError(f"Unsupported source type --> {engine}")
databricks/labs/lakebridge/reconcile/connectors/sql_server.py
@@ -0,0 +1,132 @@
+import re
+import logging
+from datetime import datetime
+
+from pyspark.errors import PySparkException
+from pyspark.sql import DataFrame, DataFrameReader, SparkSession
+from pyspark.sql.functions import col
+from sqlglot import Dialect
+
+from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource
+from databricks.labs.lakebridge.reconcile.connectors.jdbc_reader import JDBCReaderMixin
+from databricks.labs.lakebridge.reconcile.connectors.secrets import SecretsMixin
+from databricks.labs.lakebridge.reconcile.recon_config import JdbcReaderOptions, Schema
+from databricks.sdk import WorkspaceClient
+
+logger = logging.getLogger(__name__)
+
+_SCHEMA_QUERY = """SELECT
+                    COLUMN_NAME,
+                    CASE
+                        WHEN DATA_TYPE IN ('int', 'bigint')
+                            THEN DATA_TYPE
+                        WHEN DATA_TYPE IN ('smallint', 'tinyint')
+                            THEN 'smallint'
+                        WHEN DATA_TYPE IN ('decimal' ,'numeric')
+                            THEN 'decimal(' +
+                                CAST(NUMERIC_PRECISION AS VARCHAR) + ',' +
+                                CAST(NUMERIC_SCALE AS VARCHAR) + ')'
+                        WHEN DATA_TYPE IN ('float', 'real')
+                            THEN 'double'
+                        WHEN CHARACTER_MAXIMUM_LENGTH IS NOT NULL AND DATA_TYPE IN ('varchar','char','text','nchar','nvarchar','ntext')
+                            THEN DATA_TYPE
+                        WHEN DATA_TYPE IN ('date','time','datetime', 'datetime2','smalldatetime','datetimeoffset')
+                            THEN DATA_TYPE
+                        WHEN DATA_TYPE IN ('bit')
+                            THEN 'boolean'
+                        WHEN DATA_TYPE IN ('binary','varbinary')
+                            THEN 'binary'
+                        ELSE DATA_TYPE
+                    END AS 'DATA_TYPE'
+                FROM
+                    INFORMATION_SCHEMA.COLUMNS
+                WHERE
+                    LOWER(TABLE_NAME) = LOWER('{table}')
+                    AND LOWER(TABLE_SCHEMA) = LOWER('{schema}')
+                    AND LOWER(TABLE_CATALOG) = LOWER('{catalog}')
+                """
+
+
+class SQLServerDataSource(DataSource, SecretsMixin, JDBCReaderMixin):
+    _DRIVER = "sqlserver"
+
+    def __init__(
+        self,
+        engine: Dialect,
+        spark: SparkSession,
+        ws: WorkspaceClient,
+        secret_scope: str,
+    ):
+        self._engine = engine
+        self._spark = spark
+        self._ws = ws
+        self._secret_scope = secret_scope
+
+    @property
+    def get_jdbc_url(self) -> str:
+        # Construct the JDBC URL
+        return (
+            f"jdbc:{self._DRIVER}://{self._get_secret('host')}:{self._get_secret('port')};"
+            f"databaseName={self._get_secret('database')};"
+            f"user={self._get_secret('user')};"
+            f"password={self._get_secret('password')};"
+            f"encrypt={self._get_secret('encrypt')};"
+            f"trustServerCertificate={self._get_secret('trustServerCertificate')};"
+        )
+
+    def read_data(
+        self,
+        catalog: str | None,
+        schema: str,
+        table: str,
+        query: str,
+        options: JdbcReaderOptions | None,
+    ) -> DataFrame:
+        table_query = query.replace(":tbl", f"{catalog}.{schema}.{table}")
+        with_clause_pattern = re.compile(r'WITH\s+.*?\)\s*(?=SELECT)', re.IGNORECASE | re.DOTALL)
+        match = with_clause_pattern.search(table_query)
+        if match:
+            prepare_query_string = match.group(0)
+            query = table_query.replace(match.group(0), '')
+        else:
+            query = table_query
+            prepare_query_string = ""
+        try:
+            if options is None:
+                df = self.reader(query, prepare_query_string).load()
+            else:
+                options = self._get_jdbc_reader_options(options)
+                df = self._get_jdbc_reader(table_query, self.get_jdbc_url, self._DRIVER).options(**options).load()
+            return df.select([col(column).alias(column.lower()) for column in df.columns])
+        except (RuntimeError, PySparkException) as e:
+            return self.log_and_throw_exception(e, "data", table_query)
+
+    def get_schema(
+        self,
+        catalog: str | None,
+        schema: str,
+        table: str,
+    ) -> list[Schema]:
+        """
+        Fetch the Schema from the INFORMATION_SCHEMA.COLUMNS table in SQL Server.
+
+        If the user's current role does not have the necessary privileges to access the specified
+        Information Schema object, RunTimeError will be raised:
+        "SQL access control error: Insufficient privileges to operate on schema 'INFORMATION_SCHEMA' "
+        """
+        schema_query = re.sub(
+            r'\s+',
+            ' ',
+            _SCHEMA_QUERY.format(catalog=catalog, schema=schema, table=table),
+        )
+        try:
+            logger.debug(f"Fetching schema using query: \n`{schema_query}`")
+            logger.info(f"Fetching Schema: Started at: {datetime.now()}")
+            schema_metadata = self.reader(schema_query).load().collect()
+            logger.info(f"Schema fetched successfully. Completed at: {datetime.now()}")
+            return [Schema(field.COLUMN_NAME.lower(), field.DATA_TYPE.lower()) for field in schema_metadata]
+        except (RuntimeError, PySparkException) as e:
+            return self.log_and_throw_exception(e, "schema", schema_query)
+
+    def reader(self, query: str, prepare_query_str="") -> DataFrameReader:
+        return self._get_jdbc_reader(query, self.get_jdbc_url, self._DRIVER, prepare_query_str)