databricks-labs-lakebridge 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (171)
  1. databricks/__init__.py +3 -0
  2. databricks/labs/__init__.py +3 -0
  3. databricks/labs/lakebridge/__about__.py +2 -0
  4. databricks/labs/lakebridge/__init__.py +11 -0
  5. databricks/labs/lakebridge/assessments/configure_assessment.py +194 -0
  6. databricks/labs/lakebridge/assessments/pipeline.py +188 -0
  7. databricks/labs/lakebridge/assessments/profiler_config.py +30 -0
  8. databricks/labs/lakebridge/base_install.py +12 -0
  9. databricks/labs/lakebridge/cli.py +449 -0
  10. databricks/labs/lakebridge/config.py +192 -0
  11. databricks/labs/lakebridge/connections/__init__.py +0 -0
  12. databricks/labs/lakebridge/connections/credential_manager.py +89 -0
  13. databricks/labs/lakebridge/connections/database_manager.py +98 -0
  14. databricks/labs/lakebridge/connections/env_getter.py +13 -0
  15. databricks/labs/lakebridge/contexts/__init__.py +0 -0
  16. databricks/labs/lakebridge/contexts/application.py +133 -0
  17. databricks/labs/lakebridge/coverage/__init__.py +0 -0
  18. databricks/labs/lakebridge/coverage/commons.py +223 -0
  19. databricks/labs/lakebridge/coverage/lakebridge_snow_transpilation_coverage.py +29 -0
  20. databricks/labs/lakebridge/coverage/local_report.py +9 -0
  21. databricks/labs/lakebridge/coverage/sqlglot_snow_transpilation_coverage.py +5 -0
  22. databricks/labs/lakebridge/coverage/sqlglot_tsql_transpilation_coverage.py +5 -0
  23. databricks/labs/lakebridge/deployment/__init__.py +0 -0
  24. databricks/labs/lakebridge/deployment/configurator.py +199 -0
  25. databricks/labs/lakebridge/deployment/dashboard.py +140 -0
  26. databricks/labs/lakebridge/deployment/installation.py +125 -0
  27. databricks/labs/lakebridge/deployment/job.py +147 -0
  28. databricks/labs/lakebridge/deployment/recon.py +145 -0
  29. databricks/labs/lakebridge/deployment/table.py +30 -0
  30. databricks/labs/lakebridge/deployment/upgrade_common.py +124 -0
  31. databricks/labs/lakebridge/discovery/table.py +36 -0
  32. databricks/labs/lakebridge/discovery/table_definition.py +23 -0
  33. databricks/labs/lakebridge/discovery/tsql_table_definition.py +185 -0
  34. databricks/labs/lakebridge/errors/exceptions.py +1 -0
  35. databricks/labs/lakebridge/helpers/__init__.py +0 -0
  36. databricks/labs/lakebridge/helpers/db_sql.py +24 -0
  37. databricks/labs/lakebridge/helpers/execution_time.py +20 -0
  38. databricks/labs/lakebridge/helpers/file_utils.py +64 -0
  39. databricks/labs/lakebridge/helpers/metastore.py +164 -0
  40. databricks/labs/lakebridge/helpers/recon_config_utils.py +176 -0
  41. databricks/labs/lakebridge/helpers/string_utils.py +62 -0
  42. databricks/labs/lakebridge/helpers/telemetry_utils.py +13 -0
  43. databricks/labs/lakebridge/helpers/validation.py +101 -0
  44. databricks/labs/lakebridge/install.py +849 -0
  45. databricks/labs/lakebridge/intermediate/__init__.py +0 -0
  46. databricks/labs/lakebridge/intermediate/dag.py +88 -0
  47. databricks/labs/lakebridge/intermediate/engine_adapter.py +0 -0
  48. databricks/labs/lakebridge/intermediate/root_tables.py +44 -0
  49. databricks/labs/lakebridge/jvmproxy.py +56 -0
  50. databricks/labs/lakebridge/lineage.py +42 -0
  51. databricks/labs/lakebridge/reconcile/__init__.py +0 -0
  52. databricks/labs/lakebridge/reconcile/compare.py +414 -0
  53. databricks/labs/lakebridge/reconcile/connectors/__init__.py +0 -0
  54. databricks/labs/lakebridge/reconcile/connectors/data_source.py +72 -0
  55. databricks/labs/lakebridge/reconcile/connectors/databricks.py +87 -0
  56. databricks/labs/lakebridge/reconcile/connectors/jdbc_reader.py +41 -0
  57. databricks/labs/lakebridge/reconcile/connectors/oracle.py +108 -0
  58. databricks/labs/lakebridge/reconcile/connectors/secrets.py +30 -0
  59. databricks/labs/lakebridge/reconcile/connectors/snowflake.py +173 -0
  60. databricks/labs/lakebridge/reconcile/connectors/source_adapter.py +30 -0
  61. databricks/labs/lakebridge/reconcile/connectors/sql_server.py +132 -0
  62. databricks/labs/lakebridge/reconcile/constants.py +37 -0
  63. databricks/labs/lakebridge/reconcile/exception.py +42 -0
  64. databricks/labs/lakebridge/reconcile/execute.py +920 -0
  65. databricks/labs/lakebridge/reconcile/query_builder/__init__.py +0 -0
  66. databricks/labs/lakebridge/reconcile/query_builder/aggregate_query.py +293 -0
  67. databricks/labs/lakebridge/reconcile/query_builder/base.py +138 -0
  68. databricks/labs/lakebridge/reconcile/query_builder/count_query.py +33 -0
  69. databricks/labs/lakebridge/reconcile/query_builder/expression_generator.py +292 -0
  70. databricks/labs/lakebridge/reconcile/query_builder/hash_query.py +91 -0
  71. databricks/labs/lakebridge/reconcile/query_builder/sampling_query.py +123 -0
  72. databricks/labs/lakebridge/reconcile/query_builder/threshold_query.py +231 -0
  73. databricks/labs/lakebridge/reconcile/recon_capture.py +635 -0
  74. databricks/labs/lakebridge/reconcile/recon_config.py +363 -0
  75. databricks/labs/lakebridge/reconcile/recon_output_config.py +85 -0
  76. databricks/labs/lakebridge/reconcile/runner.py +97 -0
  77. databricks/labs/lakebridge/reconcile/sampler.py +239 -0
  78. databricks/labs/lakebridge/reconcile/schema_compare.py +126 -0
  79. databricks/labs/lakebridge/resources/__init__.py +0 -0
  80. databricks/labs/lakebridge/resources/config/credentials.yml +33 -0
  81. databricks/labs/lakebridge/resources/reconcile/__init__.py +0 -0
  82. databricks/labs/lakebridge/resources/reconcile/dashboards/__init__.py +0 -0
  83. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/00_0_aggregate_recon_header.md +6 -0
  84. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_0_recon_id.filter.yml +6 -0
  85. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_1_executed_by.filter.yml +5 -0
  86. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_2_started_at.filter.yml +5 -0
  87. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_0_source_type.filter.yml +5 -0
  88. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_1_source_table.filter.yml +5 -0
  89. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_2_target_table.filter.yml +5 -0
  90. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/04_0_aggregate_summary_table.sql +46 -0
  91. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/05_0_aggregate_recon_drilldown_header.md +2 -0
  92. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_0_recon_id.filter.yml +5 -0
  93. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_1_category.filter.yml +5 -0
  94. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_2_aggregate_type.filter.yml +5 -0
  95. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/07_0_target_table.filter.yml +4 -0
  96. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/07_1_source_table.filter.yml +4 -0
  97. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/08_0_aggregate_details_table.sql +92 -0
  98. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/09_0_aggregate_missing_mismatch_header.md +1 -0
  99. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/10_0_aggr_mismatched_records.sql +19 -0
  100. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/11_0_aggr_missing_in_databricks.sql +19 -0
  101. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/11_1_aggr_missing_in_source.sql +19 -0
  102. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/dashboard.yml +365 -0
  103. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/00_0_recon_main.md +3 -0
  104. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/01_0_recon_id.filter.yml +6 -0
  105. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/01_1_report_type.filter.yml +5 -0
  106. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/01_2_executed_by.filter.yml +5 -0
  107. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/02_0_source_type.filter.yml +5 -0
  108. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/02_1_source_table.filter.yml +6 -0
  109. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/02_2_target_table.filter.yml +6 -0
  110. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/03_0_started_at.filter.yml +5 -0
  111. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/05_0_summary_table.sql +38 -0
  112. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/06_0_schema_comparison_header.md +3 -0
  113. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/07_0_schema_details_table.sql +42 -0
  114. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/08_0_drill_down_header.md +3 -0
  115. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/09_0_recon_id.filter.yml +4 -0
  116. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/09_1_category.filter.yml +4 -0
  117. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/10_0_target_table.filter.yml +4 -0
  118. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/10_1_source_table.filter.yml +4 -0
  119. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/11_0_recon_details_pivot.sql +40 -0
  120. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/12_0_daily_data_validation_issue_header.md +3 -0
  121. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/13_0_success_fail_.filter.yml +4 -0
  122. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/14_0_failed_recon_ids.sql +15 -0
  123. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/15_0_total_failed_runs.sql +10 -0
  124. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/15_1_failed_targets.sql +10 -0
  125. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/15_2_successful_targets.sql +10 -0
  126. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/16_0_missing_mismatch_header.md +1 -0
  127. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/17_0_mismatched_records.sql +14 -0
  128. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/17_1_threshold_mismatches.sql +14 -0
  129. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/18_0_missing_in_databricks.sql +14 -0
  130. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/18_1_missing_in_source.sql +14 -0
  131. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/dashboard.yml +545 -0
  132. databricks/labs/lakebridge/resources/reconcile/queries/__init__.py +0 -0
  133. databricks/labs/lakebridge/resources/reconcile/queries/installation/__init__.py +0 -0
  134. databricks/labs/lakebridge/resources/reconcile/queries/installation/aggregate_details.sql +7 -0
  135. databricks/labs/lakebridge/resources/reconcile/queries/installation/aggregate_metrics.sql +15 -0
  136. databricks/labs/lakebridge/resources/reconcile/queries/installation/aggregate_rules.sql +6 -0
  137. databricks/labs/lakebridge/resources/reconcile/queries/installation/details.sql +7 -0
  138. databricks/labs/lakebridge/resources/reconcile/queries/installation/main.sql +24 -0
  139. databricks/labs/lakebridge/resources/reconcile/queries/installation/metrics.sql +21 -0
  140. databricks/labs/lakebridge/transpiler/__init__.py +0 -0
  141. databricks/labs/lakebridge/transpiler/execute.py +423 -0
  142. databricks/labs/lakebridge/transpiler/lsp/__init__.py +0 -0
  143. databricks/labs/lakebridge/transpiler/lsp/lsp_engine.py +564 -0
  144. databricks/labs/lakebridge/transpiler/sqlglot/__init__.py +0 -0
  145. databricks/labs/lakebridge/transpiler/sqlglot/dialect_utils.py +30 -0
  146. databricks/labs/lakebridge/transpiler/sqlglot/generator/__init__.py +0 -0
  147. databricks/labs/lakebridge/transpiler/sqlglot/generator/databricks.py +771 -0
  148. databricks/labs/lakebridge/transpiler/sqlglot/lca_utils.py +138 -0
  149. databricks/labs/lakebridge/transpiler/sqlglot/local_expression.py +197 -0
  150. databricks/labs/lakebridge/transpiler/sqlglot/parsers/__init__.py +0 -0
  151. databricks/labs/lakebridge/transpiler/sqlglot/parsers/oracle.py +23 -0
  152. databricks/labs/lakebridge/transpiler/sqlglot/parsers/presto.py +202 -0
  153. databricks/labs/lakebridge/transpiler/sqlglot/parsers/snowflake.py +535 -0
  154. databricks/labs/lakebridge/transpiler/sqlglot/sqlglot_engine.py +203 -0
  155. databricks/labs/lakebridge/transpiler/transpile_engine.py +49 -0
  156. databricks/labs/lakebridge/transpiler/transpile_status.py +68 -0
  157. databricks/labs/lakebridge/uninstall.py +28 -0
  158. databricks/labs/lakebridge/upgrades/v0.4.0_add_main_table_operation_name_column.py +80 -0
  159. databricks/labs/lakebridge/upgrades/v0.6.0_alter_metrics_datatype.py +51 -0
  160. databricks_labs_lakebridge-0.10.0.dist-info/METADATA +58 -0
  161. databricks_labs_lakebridge-0.10.0.dist-info/RECORD +171 -0
  162. databricks_labs_lakebridge-0.10.0.dist-info/WHEEL +4 -0
  163. databricks_labs_lakebridge-0.10.0.dist-info/entry_points.txt +2 -0
  164. databricks_labs_lakebridge-0.10.0.dist-info/licenses/LICENSE +69 -0
  165. databricks_labs_lakebridge-0.10.0.dist-info/licenses/NOTICE +42 -0
  166. docs/lakebridge/src/components/Button.tsx +81 -0
  167. docs/lakebridge/src/css/custom.css +167 -0
  168. docs/lakebridge/src/css/table.css +20 -0
  169. docs/lakebridge/src/pages/index.tsx +57 -0
  170. docs/lakebridge/src/theme/Footer/index.tsx +24 -0
  171. docs/lakebridge/src/theme/Layout/index.tsx +18 -0
databricks/labs/lakebridge/reconcile/connectors/data_source.py
@@ -0,0 +1,72 @@
+ import logging
+ from abc import ABC, abstractmethod
+
+ from pyspark.sql import DataFrame
+
+ from databricks.labs.lakebridge.reconcile.exception import DataSourceRuntimeException
+ from databricks.labs.lakebridge.reconcile.recon_config import JdbcReaderOptions, Schema
+
+ logger = logging.getLogger(__name__)
+
+
+ class DataSource(ABC):
+
+     @abstractmethod
+     def read_data(
+         self,
+         catalog: str | None,
+         schema: str,
+         table: str,
+         query: str,
+         options: JdbcReaderOptions | None,
+     ) -> DataFrame:
+         return NotImplemented
+
+     @abstractmethod
+     def get_schema(
+         self,
+         catalog: str | None,
+         schema: str,
+         table: str,
+     ) -> list[Schema]:
+         return NotImplemented
+
+     @classmethod
+     def log_and_throw_exception(cls, exception: Exception, fetch_type: str, query: str):
+         error_msg = f"Runtime exception occurred while fetching {fetch_type} using {query} : {exception}"
+         logger.warning(error_msg)
+         raise DataSourceRuntimeException(error_msg) from exception
+
+
+ class MockDataSource(DataSource):
+
+     def __init__(
+         self,
+         dataframe_repository: dict[tuple[str, str, str], DataFrame],
+         schema_repository: dict[tuple[str, str, str], list[Schema]],
+         exception: Exception = RuntimeError("Mock Exception"),
+     ):
+         self._dataframe_repository: dict[tuple[str, str, str], DataFrame] = dataframe_repository
+         self._schema_repository: dict[tuple[str, str, str], list[Schema]] = schema_repository
+         self._exception = exception
+
+     def read_data(
+         self,
+         catalog: str | None,
+         schema: str,
+         table: str,
+         query: str,
+         options: JdbcReaderOptions | None,
+     ) -> DataFrame:
+         catalog_str = catalog if catalog else ""
+         mock_df = self._dataframe_repository.get((catalog_str, schema, query))
+         if not mock_df:
+             return self.log_and_throw_exception(self._exception, "data", f"({catalog}, {schema}, {query})")
+         return mock_df
+
+     def get_schema(self, catalog: str | None, schema: str, table: str) -> list[Schema]:
+         catalog_str = catalog if catalog else ""
+         mock_schema = self._schema_repository.get((catalog_str, schema, table))
+         if not mock_schema:
+             return self.log_and_throw_exception(self._exception, "schema", f"({catalog}, {schema}, {table})")
+         return mock_schema
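
For orientation, here is a minimal sketch (not part of the package) of how MockDataSource might be wired into a unit test. Note that the dataframe repository is keyed by (catalog, schema, query) while the schema repository is keyed by (catalog, schema, table), mirroring the lookups above; the catalog, schema, table and column names here are made up.

    from pyspark.sql import SparkSession

    from databricks.labs.lakebridge.reconcile.connectors.data_source import MockDataSource
    from databricks.labs.lakebridge.reconcile.recon_config import Schema

    spark = SparkSession.builder.getOrCreate()
    query = "select id, name from :tbl"  # hypothetical query text
    source = MockDataSource(
        dataframe_repository={
            ("cat", "sch", query): spark.createDataFrame([(1, "a")], "id int, name string"),
        },
        schema_repository={
            ("cat", "sch", "employees"): [Schema("id", "int"), Schema("name", "string")],
        },
    )
    df = source.read_data("cat", "sch", "employees", query, options=None)  # looked up by (catalog, schema, query)
    columns = source.get_schema("cat", "sch", "employees")                 # looked up by (catalog, schema, table)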
databricks/labs/lakebridge/reconcile/connectors/databricks.py
@@ -0,0 +1,87 @@
+ import logging
+ import re
+ from datetime import datetime
+
+ from pyspark.errors import PySparkException
+ from pyspark.sql import DataFrame, SparkSession
+ from pyspark.sql.functions import col
+ from sqlglot import Dialect
+
+ from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource
+ from databricks.labs.lakebridge.reconcile.connectors.secrets import SecretsMixin
+ from databricks.labs.lakebridge.reconcile.recon_config import JdbcReaderOptions, Schema
+ from databricks.sdk import WorkspaceClient
+
+ logger = logging.getLogger(__name__)
+
+
+ def _get_schema_query(catalog: str, schema: str, table: str):
+     # TODO: Ensure that the target_catalog in the configuration is not set to "hive_metastore". The source_catalog
+     #  can only be set to "hive_metastore" if the source type is "databricks".
+     if schema == "global_temp":
+         return f"describe table global_temp.{table}"
+     if catalog == "hive_metastore":
+         return f"describe table {catalog}.{schema}.{table}"
+
+     query = f"""select
+                 lower(column_name) as col_name,
+                 full_data_type as data_type
+                 from {catalog}.information_schema.columns
+                 where lower(table_catalog)='{catalog}'
+                 and lower(table_schema)='{schema}'
+                 and lower(table_name) ='{table}'
+                 order by col_name"""
+     return re.sub(r'\s+', ' ', query)
+
+
+ class DatabricksDataSource(DataSource, SecretsMixin):
+
+     def __init__(
+         self,
+         engine: Dialect,
+         spark: SparkSession,
+         ws: WorkspaceClient,
+         secret_scope: str,
+     ):
+         self._engine = engine
+         self._spark = spark
+         self._ws = ws
+         self._secret_scope = secret_scope
+
+     def read_data(
+         self,
+         catalog: str | None,
+         schema: str,
+         table: str,
+         query: str,
+         options: JdbcReaderOptions | None,
+     ) -> DataFrame:
+         namespace_catalog = "hive_metastore" if not catalog else catalog
+         if schema == "global_temp":
+             namespace_catalog = "global_temp"
+         else:
+             namespace_catalog = f"{namespace_catalog}.{schema}"
+         table_with_namespace = f"{namespace_catalog}.{table}"
+         table_query = query.replace(":tbl", table_with_namespace)
+         try:
+             df = self._spark.sql(table_query)
+             return df.select([col(column).alias(column.lower()) for column in df.columns])
+         except (RuntimeError, PySparkException) as e:
+             return self.log_and_throw_exception(e, "data", table_query)
+
+     def get_schema(
+         self,
+         catalog: str | None,
+         schema: str,
+         table: str,
+     ) -> list[Schema]:
+         catalog_str = catalog if catalog else "hive_metastore"
+         schema_query = _get_schema_query(catalog_str, schema, table)
+         try:
+             logger.debug(f"Fetching schema using query: \n`{schema_query}`")
+             logger.info(f"Fetching Schema: Started at: {datetime.now()}")
+             schema_metadata = self._spark.sql(schema_query).where("col_name not like '#%'").distinct().collect()
+             logger.info(f"Schema fetched successfully. Completed at: {datetime.now()}")
+             return [Schema(field.col_name.lower(), field.data_type.lower()) for field in schema_metadata]
+         except (RuntimeError, PySparkException) as e:
+             return self.log_and_throw_exception(e, "schema", schema_query)
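
A rough usage sketch, assuming a Databricks SparkSession and a configured WorkspaceClient; the secret scope name is illustrative. The ":tbl" placeholder in the query is replaced with "<catalog>.<schema>.<table>", falling back to "hive_metastore" when no catalog is supplied.

    from pyspark.sql import SparkSession
    from databricks.sdk import WorkspaceClient

    from databricks.labs.lakebridge.reconcile.connectors.databricks import DatabricksDataSource
    from databricks.labs.lakebridge.transpiler.sqlglot.generator.databricks import Databricks

    spark = SparkSession.builder.getOrCreate()
    source = DatabricksDataSource(Databricks(), spark, WorkspaceClient(), secret_scope="lakebridge")  # hypothetical scope
    # ":tbl" becomes "hive_metastore.default.orders" because no catalog is passed
    df = source.read_data(None, "default", "orders", "select * from :tbl", options=None)
    columns = source.get_schema(None, "default", "orders")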
databricks/labs/lakebridge/reconcile/connectors/jdbc_reader.py
@@ -0,0 +1,41 @@
+ from typing import Any
+
+ from pyspark.sql import SparkSession
+
+ from databricks.labs.lakebridge.reconcile.recon_config import JdbcReaderOptions
+
+
+ class JDBCReaderMixin:
+     _spark: SparkSession
+
+     # TODO update the url
+     def _get_jdbc_reader(self, query, jdbc_url, driver, prepare_query=None):
+         driver_class = {
+             "oracle": "oracle.jdbc.driver.OracleDriver",
+             "snowflake": "net.snowflake.client.jdbc.SnowflakeDriver",
+             "sqlserver": "com.microsoft.sqlserver.jdbc.SQLServerDriver",
+         }
+         reader = (
+             self._spark.read.format("jdbc")
+             .option("url", jdbc_url)
+             .option("driver", driver_class.get(driver, driver))
+             .option("dbtable", f"({query}) tmp")
+         )
+         if prepare_query is not None:
+             reader = reader.option('prepareQuery', prepare_query)
+         return reader
+
+     @staticmethod
+     def _get_jdbc_reader_options(options: JdbcReaderOptions):
+         option_dict: dict[str, Any] = {}
+         if options.number_partitions:
+             option_dict["numPartitions"] = options.number_partitions
+         if options.partition_column:
+             option_dict["partitionColumn"] = options.partition_column
+         if options.lower_bound:
+             option_dict["lowerBound"] = options.lower_bound
+         if options.upper_bound:
+             option_dict["upperBound"] = options.upper_bound
+         if options.fetch_size:
+             option_dict["fetchsize"] = options.fetch_size
+         return option_dict
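
For reference, the mixin above composes the same options one would pass to Spark's JDBC reader directly; the equivalent raw call looks roughly like this. The URL, query, and bounds are placeholder values, and the trailing comments show which JdbcReaderOptions attribute feeds each option.

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    df = (
        spark.read.format("jdbc")
        .option("url", "jdbc:sqlserver://host:1433;databaseName=db;user=u;password=p;")  # placeholder URL
        .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver")
        .option("dbtable", "(select 1 as id) tmp")  # the wrapped query
        .option("numPartitions", 8)                 # JdbcReaderOptions.number_partitions
        .option("partitionColumn", "id")            # JdbcReaderOptions.partition_column
        .option("lowerBound", "1")                  # JdbcReaderOptions.lower_bound
        .option("upperBound", "100000")             # JdbcReaderOptions.upper_bound
        .option("fetchsize", "1000")                # JdbcReaderOptions.fetch_size
        .load()
    )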
databricks/labs/lakebridge/reconcile/connectors/oracle.py
@@ -0,0 +1,108 @@
+ import re
+ import logging
+ from datetime import datetime
+
+ from pyspark.errors import PySparkException
+ from pyspark.sql import DataFrame, DataFrameReader, SparkSession
+ from pyspark.sql.functions import col
+ from sqlglot import Dialect
+
+ from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource
+ from databricks.labs.lakebridge.reconcile.connectors.jdbc_reader import JDBCReaderMixin
+ from databricks.labs.lakebridge.reconcile.connectors.secrets import SecretsMixin
+ from databricks.labs.lakebridge.reconcile.recon_config import JdbcReaderOptions, Schema
+ from databricks.sdk import WorkspaceClient
+
+ logger = logging.getLogger(__name__)
+
+
+ class OracleDataSource(DataSource, SecretsMixin, JDBCReaderMixin):
+     _DRIVER = "oracle"
+     _SCHEMA_QUERY = """select column_name, case when (data_precision is not null
+             and data_scale <> 0)
+             then data_type || '(' || data_precision || ',' || data_scale || ')'
+             when (data_precision is not null and data_scale = 0)
+             then data_type || '(' || data_precision || ')'
+             when data_precision is null and (lower(data_type) in ('date') or
+             lower(data_type) like 'timestamp%') then data_type
+             when CHAR_LENGTH = 0 then data_type
+             else data_type || '(' || CHAR_LENGTH || ')'
+             end data_type
+             FROM ALL_TAB_COLUMNS
+             WHERE lower(TABLE_NAME) = '{table}' and lower(owner) = '{owner}'"""
+
+     def __init__(
+         self,
+         engine: Dialect,
+         spark: SparkSession,
+         ws: WorkspaceClient,
+         secret_scope: str,
+     ):
+         self._engine = engine
+         self._spark = spark
+         self._ws = ws
+         self._secret_scope = secret_scope
+
+     @property
+     def get_jdbc_url(self) -> str:
+         return (
+             f"jdbc:{OracleDataSource._DRIVER}:thin:{self._get_secret('user')}"
+             f"/{self._get_secret('password')}@//{self._get_secret('host')}"
+             f":{self._get_secret('port')}/{self._get_secret('database')}"
+         )
+
+     def read_data(
+         self,
+         catalog: str | None,
+         schema: str,
+         table: str,
+         query: str,
+         options: JdbcReaderOptions | None,
+     ) -> DataFrame:
+         table_query = query.replace(":tbl", f"{schema}.{table}")
+         try:
+             if options is None:
+                 return self.reader(table_query).options(**self._get_timestamp_options()).load()
+             reader_options = self._get_jdbc_reader_options(options) | self._get_timestamp_options()
+             df = self.reader(table_query).options(**reader_options).load()
+             logger.warning(f"Fetching data using query: \n`{table_query}`")
+
+             # Convert all column names to lower case
+             df = df.select([col(c).alias(c.lower()) for c in df.columns])
+             return df
+         except (RuntimeError, PySparkException) as e:
+             return self.log_and_throw_exception(e, "data", table_query)
+
+     def get_schema(
+         self,
+         catalog: str | None,
+         schema: str,
+         table: str,
+     ) -> list[Schema]:
+         schema_query = re.sub(
+             r'\s+',
+             ' ',
+             OracleDataSource._SCHEMA_QUERY.format(table=table, owner=schema),
+         )
+         try:
+             logger.debug(f"Fetching schema using query: \n`{schema_query}`")
+             logger.info(f"Fetching Schema: Started at: {datetime.now()}")
+             df = self.reader(schema_query).load()
+             schema_metadata = df.select([col(c).alias(c.lower()) for c in df.columns]).collect()
+             logger.info(f"Schema fetched successfully. Completed at: {datetime.now()}")
+             logger.debug(f"schema_metadata: ${schema_metadata}")
+             return [Schema(field.column_name.lower(), field.data_type.lower()) for field in schema_metadata]
+         except (RuntimeError, PySparkException) as e:
+             return self.log_and_throw_exception(e, "schema", schema_query)
+
+     @staticmethod
+     def _get_timestamp_options() -> dict[str, str]:
+         return {
+             "oracle.jdbc.mapDateToTimestamp": "False",
+             "sessionInitStatement": "BEGIN dbms_session.set_nls('nls_date_format', "
+             "'''YYYY-MM-DD''');dbms_session.set_nls('nls_timestamp_format', '''YYYY-MM-DD "
+             "HH24:MI:SS''');END;",
+         }
+
+     def reader(self, query: str) -> DataFrameReader:
+         return self._get_jdbc_reader(query, self.get_jdbc_url, OracleDataSource._DRIVER)
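
The connector resolves its connection details from the secret scope at runtime. A sketch of seeding those secrets with the Databricks SDK follows; the scope name and values are placeholders and the scope is assumed to already exist. They feed the URL jdbc:oracle:thin:<user>/<password>@//<host>:<port>/<database> built by get_jdbc_url above.

    from databricks.sdk import WorkspaceClient

    ws = WorkspaceClient()
    scope = "lakebridge-oracle"  # hypothetical, pre-created scope
    for key, value in {
        "user": "scott",
        "password": "tiger",
        "host": "oracle.example.com",
        "port": "1521",
        "database": "ORCLPDB1",
    }.items():
        ws.secrets.put_secret(scope=scope, key=key, string_value=value)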
databricks/labs/lakebridge/reconcile/connectors/secrets.py
@@ -0,0 +1,30 @@
+ import base64
+ import logging
+
+ from databricks.sdk import WorkspaceClient
+ from databricks.sdk.errors import NotFound
+
+ logger = logging.getLogger(__name__)
+
+
+ class SecretsMixin:
+     _ws: WorkspaceClient
+     _secret_scope: str
+
+     def _get_secret(self, secret_key: str) -> str:
+         """Get the secret value given a secret scope & secret key. Log a warning if secret does not exist"""
+         try:
+             # Return the decoded secret value in string format
+             secret = self._ws.secrets.get_secret(self._secret_scope, secret_key)
+             assert secret.value is not None
+             return base64.b64decode(secret.value).decode("utf-8")
+         except NotFound as e:
+             raise NotFound(f'Secret does not exist with scope: {self._secret_scope} and key: {secret_key} : {e}') from e
+         except UnicodeDecodeError as e:
+             raise UnicodeDecodeError(
+                 "utf-8",
+                 secret_key.encode(),
+                 0,
+                 1,
+                 f"Secret {self._secret_scope}/{secret_key} has Base64 bytes that cannot be decoded to utf-8 string: {e}.",
+             ) from e
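
A minimal sketch of reusing the mixin outside the connectors; the scope and key names are hypothetical. get_secret() returns the value base64-encoded, which is why the mixin decodes it before handing back a str.

    from databricks.sdk import WorkspaceClient

    from databricks.labs.lakebridge.reconcile.connectors.secrets import SecretsMixin


    class MyCredentials(SecretsMixin):
        def __init__(self, ws: WorkspaceClient, secret_scope: str):
            self._ws = ws
            self._secret_scope = secret_scope

        def password(self) -> str:
            return self._get_secret("password")  # hypothetical key


    creds = MyCredentials(WorkspaceClient(), "lakebridge")  # hypothetical scope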
databricks/labs/lakebridge/reconcile/connectors/snowflake.py
@@ -0,0 +1,173 @@
+ import logging
+ import re
+ from datetime import datetime
+
+ from pyspark.errors import PySparkException
+ from pyspark.sql import DataFrame, DataFrameReader, SparkSession
+ from pyspark.sql.functions import col
+ from sqlglot import Dialect
+ from cryptography.hazmat.backends import default_backend
+ from cryptography.hazmat.primitives import serialization
+
+ from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource
+ from databricks.labs.lakebridge.reconcile.connectors.jdbc_reader import JDBCReaderMixin
+ from databricks.labs.lakebridge.reconcile.connectors.secrets import SecretsMixin
+ from databricks.labs.lakebridge.reconcile.exception import InvalidSnowflakePemPrivateKey
+ from databricks.labs.lakebridge.reconcile.recon_config import JdbcReaderOptions, Schema
+ from databricks.sdk import WorkspaceClient
+ from databricks.sdk.errors import NotFound
+
+ logger = logging.getLogger(__name__)
+
+
+ class SnowflakeDataSource(DataSource, SecretsMixin, JDBCReaderMixin):
+     _DRIVER = "snowflake"
+     """
+     * INFORMATION_SCHEMA:
+         - see https://docs.snowflake.com/en/sql-reference/info-schema#considerations-for-replacing-show-commands-with-information-schema-views
+     * DATA:
+         - only unquoted identifiers are treated as case-insensitive and are stored in uppercase.
+         - for quoted identifiers refer:
+           https://docs.snowflake.com/en/sql-reference/identifiers-syntax#double-quoted-identifiers
+     * ORDINAL_POSITION:
+         - indicates the sequential order of a column within a table or view,
+           starting from 1 based on the order of column definition.
+     """
+     _SCHEMA_QUERY = """select column_name,
+             case
+                 when numeric_precision is not null and numeric_scale is not null
+                     then
+                         concat(data_type, '(', numeric_precision, ',' , numeric_scale, ')')
+                 when lower(data_type) = 'text'
+                     then
+                         concat('varchar', '(', CHARACTER_MAXIMUM_LENGTH, ')')
+                 else data_type
+             end as data_type
+             from {catalog}.INFORMATION_SCHEMA.COLUMNS
+             where lower(table_name)='{table}' and table_schema = '{schema}'
+             order by ordinal_position"""
+
+     def __init__(
+         self,
+         engine: Dialect,
+         spark: SparkSession,
+         ws: WorkspaceClient,
+         secret_scope: str,
+     ):
+         self._engine = engine
+         self._spark = spark
+         self._ws = ws
+         self._secret_scope = secret_scope
+
+     @property
+     def get_jdbc_url(self) -> str:
+         try:
+             sf_password = self._get_secret('sfPassword')
+         except (NotFound, KeyError) as e:
+             message = "sfPassword is mandatory for jdbc connectivity with Snowflake."
+             logger.error(message)
+             raise NotFound(message) from e
+
+         return (
+             f"jdbc:{SnowflakeDataSource._DRIVER}://{self._get_secret('sfAccount')}.snowflakecomputing.com"
+             f"/?user={self._get_secret('sfUser')}&password={sf_password}"
+             f"&db={self._get_secret('sfDatabase')}&schema={self._get_secret('sfSchema')}"
+             f"&warehouse={self._get_secret('sfWarehouse')}&role={self._get_secret('sfRole')}"
+         )
+
+     @staticmethod
+     def get_private_key(pem_private_key: str) -> str:
+         try:
+             private_key_bytes = pem_private_key.encode("UTF-8")
+             p_key = serialization.load_pem_private_key(
+                 private_key_bytes,
+                 password=None,
+                 backend=default_backend(),
+             )
+             pkb = p_key.private_bytes(
+                 encoding=serialization.Encoding.PEM,
+                 format=serialization.PrivateFormat.PKCS8,
+                 encryption_algorithm=serialization.NoEncryption(),
+             )
+             pkb_str = pkb.decode("UTF-8")
+             # Remove the first and last lines (BEGIN/END markers)
+             private_key_pem_lines = pkb_str.strip().split('\n')[1:-1]
+             # Join the lines to form the base64 encoded string
+             private_key_pem_str = ''.join(private_key_pem_lines)
+             return private_key_pem_str
+         except Exception as e:
+             message = f"Failed to load or process the provided PEM private key. --> {e}"
+             logger.error(message)
+             raise InvalidSnowflakePemPrivateKey(message) from e
+
+     def read_data(
+         self,
+         catalog: str | None,
+         schema: str,
+         table: str,
+         query: str,
+         options: JdbcReaderOptions | None,
+     ) -> DataFrame:
+         table_query = query.replace(":tbl", f"{catalog}.{schema}.{table}")
+         try:
+             if options is None:
+                 df = self.reader(table_query).load()
+             else:
+                 options = self._get_jdbc_reader_options(options)
+                 df = (
+                     self._get_jdbc_reader(table_query, self.get_jdbc_url, SnowflakeDataSource._DRIVER)
+                     .options(**options)
+                     .load()
+                 )
+             return df.select([col(column).alias(column.lower()) for column in df.columns])
+         except (RuntimeError, PySparkException) as e:
+             return self.log_and_throw_exception(e, "data", table_query)
+
+     def get_schema(
+         self,
+         catalog: str | None,
+         schema: str,
+         table: str,
+     ) -> list[Schema]:
+         """
+         Fetch the Schema from the INFORMATION_SCHEMA.COLUMNS table in Snowflake.
+
+         If the user's current role does not have the necessary privileges to access the specified
+         Information Schema object, RunTimeError will be raised:
+         "SQL access control error: Insufficient privileges to operate on schema 'INFORMATION_SCHEMA' "
+         """
+         schema_query = re.sub(
+             r'\s+',
+             ' ',
+             SnowflakeDataSource._SCHEMA_QUERY.format(catalog=catalog, schema=schema.upper(), table=table),
+         )
+         try:
+             logger.debug(f"Fetching schema using query: \n`{schema_query}`")
+             logger.info(f"Fetching Schema: Started at: {datetime.now()}")
+             schema_metadata = self.reader(schema_query).load().collect()
+             logger.info(f"Schema fetched successfully. Completed at: {datetime.now()}")
+             return [Schema(field.COLUMN_NAME.lower(), field.DATA_TYPE.lower()) for field in schema_metadata]
+         except (RuntimeError, PySparkException) as e:
+             return self.log_and_throw_exception(e, "schema", schema_query)
+
+     def reader(self, query: str) -> DataFrameReader:
+         options = {
+             "sfUrl": self._get_secret('sfUrl'),
+             "sfUser": self._get_secret('sfUser'),
+             "sfDatabase": self._get_secret('sfDatabase'),
+             "sfSchema": self._get_secret('sfSchema'),
+             "sfWarehouse": self._get_secret('sfWarehouse'),
+             "sfRole": self._get_secret('sfRole'),
+         }
+         try:
+             options["pem_private_key"] = SnowflakeDataSource.get_private_key(self._get_secret('pem_private_key'))
+         except (NotFound, KeyError):
+             logger.warning("pem_private_key not found. Checking for sfPassword")
+             try:
+                 options["sfPassword"] = self._get_secret('sfPassword')
+             except (NotFound, KeyError) as e:
+                 message = "sfPassword and pem_private_key not found. Either one is required for snowflake auth."
+                 logger.error(message)
+                 raise NotFound(message) from e
+
+         return self._spark.read.format("snowflake").option("dbtable", f"({query}) as tmp").options(**options)
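
A sketch of what get_private_key() produces: the single-line base64 body of an unencrypted PKCS8 PEM key with the BEGIN/END markers stripped, i.e. the value that reader() passes through as the pem_private_key option above. The key below is generated only for illustration.

    from cryptography.hazmat.primitives import serialization
    from cryptography.hazmat.primitives.asymmetric import rsa

    from databricks.labs.lakebridge.reconcile.connectors.snowflake import SnowflakeDataSource

    key = rsa.generate_private_key(public_exponent=65537, key_size=2048)
    pem = key.private_bytes(
        encoding=serialization.Encoding.PEM,
        format=serialization.PrivateFormat.PKCS8,
        encryption_algorithm=serialization.NoEncryption(),
    ).decode("utf-8")
    print(SnowflakeDataSource.get_private_key(pem))  # one line of base64, no PEM markers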
databricks/labs/lakebridge/reconcile/connectors/source_adapter.py
@@ -0,0 +1,30 @@
+ from pyspark.sql import SparkSession
+ from sqlglot import Dialect
+ from sqlglot.dialects import TSQL
+
+ from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource
+ from databricks.labs.lakebridge.reconcile.connectors.databricks import DatabricksDataSource
+ from databricks.labs.lakebridge.reconcile.connectors.oracle import OracleDataSource
+ from databricks.labs.lakebridge.reconcile.connectors.snowflake import SnowflakeDataSource
+ from databricks.labs.lakebridge.reconcile.connectors.sql_server import SQLServerDataSource
+ from databricks.labs.lakebridge.transpiler.sqlglot.generator.databricks import Databricks
+ from databricks.labs.lakebridge.transpiler.sqlglot.parsers.oracle import Oracle
+ from databricks.labs.lakebridge.transpiler.sqlglot.parsers.snowflake import Snowflake
+ from databricks.sdk import WorkspaceClient
+
+
+ def create_adapter(
+     engine: Dialect,
+     spark: SparkSession,
+     ws: WorkspaceClient,
+     secret_scope: str,
+ ) -> DataSource:
+     if isinstance(engine, Snowflake):
+         return SnowflakeDataSource(engine, spark, ws, secret_scope)
+     if isinstance(engine, Oracle):
+         return OracleDataSource(engine, spark, ws, secret_scope)
+     if isinstance(engine, Databricks):
+         return DatabricksDataSource(engine, spark, ws, secret_scope)
+     if isinstance(engine, TSQL):
+         return SQLServerDataSource(engine, spark, ws, secret_scope)
+     raise ValueError(f"Unsupported source type --> {engine}")
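
A usage sketch of the factory, which simply dispatches on the sqlglot dialect instance. It assumes a Databricks SparkSession, a configured WorkspaceClient, and an existing secret scope; the scope name and table identifiers are illustrative.

    from pyspark.sql import SparkSession
    from databricks.sdk import WorkspaceClient

    from databricks.labs.lakebridge.reconcile.connectors.source_adapter import create_adapter
    from databricks.labs.lakebridge.transpiler.sqlglot.parsers.snowflake import Snowflake

    spark = SparkSession.builder.getOrCreate()
    source = create_adapter(Snowflake(), spark, WorkspaceClient(), secret_scope="lakebridge")  # hypothetical scope
    columns = source.get_schema(catalog="analytics", schema="public", table="orders")  # placeholder identifiers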
databricks/labs/lakebridge/reconcile/connectors/sql_server.py
@@ -0,0 +1,132 @@
+ import re
+ import logging
+ from datetime import datetime
+
+ from pyspark.errors import PySparkException
+ from pyspark.sql import DataFrame, DataFrameReader, SparkSession
+ from pyspark.sql.functions import col
+ from sqlglot import Dialect
+
+ from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource
+ from databricks.labs.lakebridge.reconcile.connectors.jdbc_reader import JDBCReaderMixin
+ from databricks.labs.lakebridge.reconcile.connectors.secrets import SecretsMixin
+ from databricks.labs.lakebridge.reconcile.recon_config import JdbcReaderOptions, Schema
+ from databricks.sdk import WorkspaceClient
+
+ logger = logging.getLogger(__name__)
+
+ _SCHEMA_QUERY = """SELECT
+                     COLUMN_NAME,
+                     CASE
+                         WHEN DATA_TYPE IN ('int', 'bigint')
+                         THEN DATA_TYPE
+                         WHEN DATA_TYPE IN ('smallint', 'tinyint')
+                         THEN 'smallint'
+                         WHEN DATA_TYPE IN ('decimal' ,'numeric')
+                         THEN 'decimal(' +
+                             CAST(NUMERIC_PRECISION AS VARCHAR) + ',' +
+                             CAST(NUMERIC_SCALE AS VARCHAR) + ')'
+                         WHEN DATA_TYPE IN ('float', 'real')
+                         THEN 'double'
+                         WHEN CHARACTER_MAXIMUM_LENGTH IS NOT NULL AND DATA_TYPE IN ('varchar','char','text','nchar','nvarchar','ntext')
+                         THEN DATA_TYPE
+                         WHEN DATA_TYPE IN ('date','time','datetime', 'datetime2','smalldatetime','datetimeoffset')
+                         THEN DATA_TYPE
+                         WHEN DATA_TYPE IN ('bit')
+                         THEN 'boolean'
+                         WHEN DATA_TYPE IN ('binary','varbinary')
+                         THEN 'binary'
+                         ELSE DATA_TYPE
+                     END AS 'DATA_TYPE'
+                 FROM
+                     INFORMATION_SCHEMA.COLUMNS
+                 WHERE
+                     LOWER(TABLE_NAME) = LOWER('{table}')
+                 AND LOWER(TABLE_SCHEMA) = LOWER('{schema}')
+                 AND LOWER(TABLE_CATALOG) = LOWER('{catalog}')
+                 """
+
+
+ class SQLServerDataSource(DataSource, SecretsMixin, JDBCReaderMixin):
+     _DRIVER = "sqlserver"
+
+     def __init__(
+         self,
+         engine: Dialect,
+         spark: SparkSession,
+         ws: WorkspaceClient,
+         secret_scope: str,
+     ):
+         self._engine = engine
+         self._spark = spark
+         self._ws = ws
+         self._secret_scope = secret_scope
+
+     @property
+     def get_jdbc_url(self) -> str:
+         # Construct the JDBC URL
+         return (
+             f"jdbc:{self._DRIVER}://{self._get_secret('host')}:{self._get_secret('port')};"
+             f"databaseName={self._get_secret('database')};"
+             f"user={self._get_secret('user')};"
+             f"password={self._get_secret('password')};"
+             f"encrypt={self._get_secret('encrypt')};"
+             f"trustServerCertificate={self._get_secret('trustServerCertificate')};"
+         )
+
+     def read_data(
+         self,
+         catalog: str | None,
+         schema: str,
+         table: str,
+         query: str,
+         options: JdbcReaderOptions | None,
+     ) -> DataFrame:
+         table_query = query.replace(":tbl", f"{catalog}.{schema}.{table}")
+         with_clause_pattern = re.compile(r'WITH\s+.*?\)\s*(?=SELECT)', re.IGNORECASE | re.DOTALL)
+         match = with_clause_pattern.search(table_query)
+         if match:
+             prepare_query_string = match.group(0)
+             query = table_query.replace(match.group(0), '')
+         else:
+             query = table_query
+             prepare_query_string = ""
+         try:
+             if options is None:
+                 df = self.reader(query, prepare_query_string).load()
+             else:
+                 options = self._get_jdbc_reader_options(options)
+                 df = self._get_jdbc_reader(table_query, self.get_jdbc_url, self._DRIVER).options(**options).load()
+             return df.select([col(column).alias(column.lower()) for column in df.columns])
+         except (RuntimeError, PySparkException) as e:
+             return self.log_and_throw_exception(e, "data", table_query)
+
+     def get_schema(
+         self,
+         catalog: str | None,
+         schema: str,
+         table: str,
+     ) -> list[Schema]:
+         """
+         Fetch the Schema from the INFORMATION_SCHEMA.COLUMNS table in SQL Server.
+
+         If the user's current role does not have the necessary privileges to access the specified
+         Information Schema object, RunTimeError will be raised:
+         "SQL access control error: Insufficient privileges to operate on schema 'INFORMATION_SCHEMA' "
+         """
+         schema_query = re.sub(
+             r'\s+',
+             ' ',
+             _SCHEMA_QUERY.format(catalog=catalog, schema=schema, table=table),
+         )
+         try:
+             logger.debug(f"Fetching schema using query: \n`{schema_query}`")
+             logger.info(f"Fetching Schema: Started at: {datetime.now()}")
+             schema_metadata = self.reader(schema_query).load().collect()
+             logger.info(f"Schema fetched successfully. Completed at: {datetime.now()}")
+             return [Schema(field.COLUMN_NAME.lower(), field.DATA_TYPE.lower()) for field in schema_metadata]
+         except (RuntimeError, PySparkException) as e:
+             return self.log_and_throw_exception(e, "schema", schema_query)
+
+     def reader(self, query: str, prepare_query_str="") -> DataFrameReader:
+         return self._get_jdbc_reader(query, self.get_jdbc_url, self._DRIVER, prepare_query_str)
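
To illustrate the CTE handling in read_data() above: the WITH clause is captured and sent as Spark's prepareQuery option while the trailing SELECT becomes the dbtable subquery. A standalone sketch with a placeholder query:

    import re

    table_query = (
        "WITH recent AS (SELECT * FROM sales.dbo.orders WHERE order_date > '2024-01-01') "
        "SELECT * FROM recent"
    )
    with_clause_pattern = re.compile(r'WITH\s+.*?\)\s*(?=SELECT)', re.IGNORECASE | re.DOTALL)
    match = with_clause_pattern.search(table_query)
    prepare_query = match.group(0) if match else ""
    select_query = table_query.replace(prepare_query, "") if match else table_query
    # prepare_query -> "WITH recent AS (...) "  (passed via .option("prepareQuery", ...))
    # select_query  -> "SELECT * FROM recent"   (wrapped by the reader as "(SELECT * FROM recent) tmp")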