databricks-labs-lakebridge 0.10.6__py3-none-any.whl → 0.10.7__py3-none-any.whl

This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (37)
  1. databricks/labs/lakebridge/__about__.py +1 -1
  2. databricks/labs/lakebridge/analyzer/__init__.py +0 -0
  3. databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py +95 -0
  4. databricks/labs/lakebridge/base_install.py +24 -3
  5. databricks/labs/lakebridge/cli.py +19 -53
  6. databricks/labs/lakebridge/contexts/application.py +7 -0
  7. databricks/labs/lakebridge/deployment/job.py +2 -2
  8. databricks/labs/lakebridge/helpers/file_utils.py +36 -0
  9. databricks/labs/lakebridge/install.py +187 -157
  10. databricks/labs/lakebridge/reconcile/compare.py +70 -33
  11. databricks/labs/lakebridge/reconcile/connectors/data_source.py +19 -0
  12. databricks/labs/lakebridge/reconcile/connectors/databricks.py +11 -1
  13. databricks/labs/lakebridge/reconcile/connectors/dialect_utils.py +126 -0
  14. databricks/labs/lakebridge/reconcile/connectors/models.py +7 -0
  15. databricks/labs/lakebridge/reconcile/connectors/oracle.py +11 -1
  16. databricks/labs/lakebridge/reconcile/connectors/snowflake.py +14 -2
  17. databricks/labs/lakebridge/reconcile/connectors/tsql.py +27 -2
  18. databricks/labs/lakebridge/reconcile/constants.py +4 -3
  19. databricks/labs/lakebridge/reconcile/execute.py +9 -810
  20. databricks/labs/lakebridge/reconcile/normalize_recon_config_service.py +133 -0
  21. databricks/labs/lakebridge/reconcile/query_builder/base.py +3 -7
  22. databricks/labs/lakebridge/reconcile/recon_config.py +3 -0
  23. databricks/labs/lakebridge/reconcile/recon_output_config.py +2 -1
  24. databricks/labs/lakebridge/reconcile/reconciliation.py +508 -0
  25. databricks/labs/lakebridge/reconcile/schema_compare.py +26 -19
  26. databricks/labs/lakebridge/reconcile/trigger_recon_aggregate_service.py +98 -0
  27. databricks/labs/lakebridge/reconcile/trigger_recon_service.py +253 -0
  28. databricks/labs/lakebridge/reconcile/utils.py +38 -0
  29. databricks/labs/lakebridge/transpiler/lsp/lsp_engine.py +45 -60
  30. databricks/labs/lakebridge/transpiler/sqlglot/dialect_utils.py +2 -0
  31. databricks/labs/lakebridge/transpiler/transpile_engine.py +0 -18
  32. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/METADATA +1 -1
  33. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/RECORD +37 -28
  34. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/WHEEL +0 -0
  35. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/entry_points.txt +0 -0
  36. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/licenses/LICENSE +0 -0
  37. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/licenses/NOTICE +0 -0
databricks/labs/lakebridge/reconcile/connectors/data_source.py

@@ -3,6 +3,7 @@ from abc import ABC, abstractmethod
 
 from pyspark.sql import DataFrame
 
+from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier
 from databricks.labs.lakebridge.reconcile.exception import DataSourceRuntimeException
 from databricks.labs.lakebridge.reconcile.recon_config import JdbcReaderOptions, Schema
 
@@ -31,12 +32,27 @@ class DataSource(ABC):
     ) -> list[Schema]:
         return NotImplemented
 
+    @abstractmethod
+    def normalize_identifier(self, identifier: str) -> NormalizedIdentifier:
+        pass
+
     @classmethod
     def log_and_throw_exception(cls, exception: Exception, fetch_type: str, query: str):
         error_msg = f"Runtime exception occurred while fetching {fetch_type} using {query} : {exception}"
         logger.warning(error_msg)
         raise DataSourceRuntimeException(error_msg) from exception
 
+    def _map_meta_column(self, meta_column) -> Schema:
+        """Create a normalized Schema DTO from the database metadata
+
+        Used in the implementations of get_schema to build a Schema DTO from the `INFORMATION_SCHEMA` query result.
+        The returned Schema is normalized in case the database is having columns with special characters and standardize
+        """
+        name = meta_column.col_name
+        dtype = meta_column.data_type.strip().lower()
+        normalized = self.normalize_identifier(name)
+        return Schema(normalized.ansi_normalized, dtype, normalized.ansi_normalized, normalized.source_normalized)
+
 
 class MockDataSource(DataSource):
 
@@ -70,3 +86,6 @@ class MockDataSource(DataSource):
         if not mock_schema:
             return self.log_and_throw_exception(self._exception, "schema", f"({catalog}, {schema}, {table})")
         return mock_schema
+
+    def normalize_identifier(self, identifier: str) -> NormalizedIdentifier:
+        return NormalizedIdentifier(identifier, identifier)
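The `DataSource` base class now obliges every connector to expose `normalize_identifier` and provides `_map_meta_column` so that `get_schema` implementations build `Schema` entries from already-normalized column names. Below is a minimal sketch of how a connector might satisfy the new contract; the `CsvCatalogSource` class and the `SimpleNamespace` metadata row are illustrative only, and the remaining abstract members of `DataSource` are omitted:

    from types import SimpleNamespace

    from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource
    from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils
    from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier


    class CsvCatalogSource(DataSource):  # hypothetical connector, for illustration only
        def get_schema(self, catalog, schema, table):
            # Stand-in for a row returned by an INFORMATION_SCHEMA query.
            row = SimpleNamespace(col_name="Order ID", data_type=" VARCHAR ")
            # _map_meta_column strips/lowercases the type and keeps both the
            # ANSI (backtick) and source-delimited forms of the column name.
            return [self._map_meta_column(row)]

        def normalize_identifier(self, identifier: str) -> NormalizedIdentifier:
            # Delegate to DialectUtils with this source's quoting character.
            return DialectUtils.normalize_identifier(identifier, '"', '"')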
databricks/labs/lakebridge/reconcile/connectors/databricks.py

@@ -8,7 +8,9 @@ from pyspark.sql.functions import col
 from sqlglot import Dialect
 
 from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource
+from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier
 from databricks.labs.lakebridge.reconcile.connectors.secrets import SecretsMixin
+from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils
 from databricks.labs.lakebridge.reconcile.recon_config import JdbcReaderOptions, Schema
 from databricks.sdk import WorkspaceClient
 
@@ -35,6 +37,7 @@ def _get_schema_query(catalog: str, schema: str, table: str):
 
 
 class DatabricksDataSource(DataSource, SecretsMixin):
+    _IDENTIFIER_DELIMITER = "`"
 
     def __init__(
         self,
@@ -82,6 +85,13 @@ class DatabricksDataSource(DataSource, SecretsMixin):
             logger.info(f"Fetching Schema: Started at: {datetime.now()}")
             schema_metadata = self._spark.sql(schema_query).where("col_name not like '#%'").distinct().collect()
             logger.info(f"Schema fetched successfully. Completed at: {datetime.now()}")
-            return [Schema(field.col_name.lower(), field.data_type.lower()) for field in schema_metadata]
+            return [self._map_meta_column(field) for field in schema_metadata]
         except (RuntimeError, PySparkException) as e:
             return self.log_and_throw_exception(e, "schema", schema_query)
+
+    def normalize_identifier(self, identifier: str) -> NormalizedIdentifier:
+        return DialectUtils.normalize_identifier(
+            identifier,
+            source_start_delimiter=DatabricksDataSource._IDENTIFIER_DELIMITER,
+            source_end_delimiter=DatabricksDataSource._IDENTIFIER_DELIMITER,
+        )
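With the backtick delimiter wired in, the Databricks connector yields the same backtick-delimited form for both the ANSI and the source representation. A hedged sketch of the expected result, traced from the `DialectUtils` logic introduced in this release (the connector makes this same call internally); the values shown are traced behaviour, not documented guarantees:

    from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils

    # Same call DatabricksDataSource.normalize_identifier makes, with its backtick delimiter.
    normalized = DialectUtils.normalize_identifier("`Order ID`", "`", "`")
    assert normalized.ansi_normalized == "`order id`"
    assert normalized.source_normalized == "`order id`"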
databricks/labs/lakebridge/reconcile/connectors/dialect_utils.py

@@ -0,0 +1,126 @@
+from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier
+
+
+class DialectUtils:
+    _ANSI_IDENTIFIER_DELIMITER = "`"
+
+    @staticmethod
+    def unnormalize_identifier(identifier: str) -> str:
+        """Return an ansi identifier without the outer backticks.
+
+        Use this at your own risk as the missing outer backticks will result in bugs.
+        E.g. <`mary's lamb`> is returned <mary's lamb> so the outer backticks are needed.
+        This is useful for scenarios where the returned identifier will be part of another delimited identifier.
+
+        :param identifier: a database identifier
+        :return: ansi identifier without the outer backticks
+        """
+        ansi = DialectUtils.ansi_normalize_identifier(identifier)
+        unescape = (
+            DialectUtils._unescape_source_end_delimiter(ansi[1:-1], DialectUtils._ANSI_IDENTIFIER_DELIMITER)
+            if ansi
+            else ansi
+        )
+        return unescape
+
+    @staticmethod
+    def ansi_normalize_identifier(identifier: str) -> str:
+        return DialectUtils.normalize_identifier(
+            identifier, DialectUtils._ANSI_IDENTIFIER_DELIMITER, DialectUtils._ANSI_IDENTIFIER_DELIMITER
+        ).ansi_normalized
+
+    @staticmethod
+    def normalize_identifier(
+        identifier: str, source_start_delimiter: str, source_end_delimiter: str
+    ) -> NormalizedIdentifier:
+        identifier = identifier.strip().lower()
+
+        ansi = DialectUtils._normalize_identifier_source_agnostic(
+            identifier,
+            source_start_delimiter,
+            source_end_delimiter,
+            DialectUtils._ANSI_IDENTIFIER_DELIMITER,
+            DialectUtils._ANSI_IDENTIFIER_DELIMITER,
+        )
+
+        # Input was already ansi normalized
+        if ansi == identifier:
+            source = DialectUtils._normalize_identifier_source_agnostic(
+                identifier,
+                DialectUtils._ANSI_IDENTIFIER_DELIMITER,
+                DialectUtils._ANSI_IDENTIFIER_DELIMITER,
+                source_start_delimiter,
+                source_end_delimiter,
+            )
+
+            # Ansi has backticks escaped which has to be unescaped for other delimiters and escape source end delimiters
+            if source != ansi:
+                source = DialectUtils._unescape_source_end_delimiter(source, DialectUtils._ANSI_IDENTIFIER_DELIMITER)
+                source = (
+                    DialectUtils._escape_source_end_delimiter(source, source_start_delimiter, source_end_delimiter)
+                    if source
+                    else source
+                )
+        else:
+            # Make sure backticks are escaped properly for ansi and source end delimiters are unescaped
+            ansi = DialectUtils._unescape_source_end_delimiter(ansi, source_end_delimiter)
+            ansi = DialectUtils._escape_backticks(ansi) if ansi else ansi
+
+            if source_end_delimiter != DialectUtils._ANSI_IDENTIFIER_DELIMITER:
+                ansi = DialectUtils._unescape_source_end_delimiter(ansi, source_end_delimiter)
+
+            source = DialectUtils._normalize_identifier_source_agnostic(
+                identifier, source_start_delimiter, source_end_delimiter, source_start_delimiter, source_end_delimiter
+            )
+
+            # Make sure source end delimiter is escaped else nothing as it was already normalized
+            if source != identifier:
+                source = (
+                    DialectUtils._escape_source_end_delimiter(source, source_start_delimiter, source_end_delimiter)
+                    if source
+                    else source
+                )
+
+        return NormalizedIdentifier(ansi, source)
+
+    @staticmethod
+    def _normalize_identifier_source_agnostic(
+        identifier: str,
+        source_start_delimiter: str,
+        source_end_delimiter: str,
+        expected_source_start_delimiter: str,
+        expected_source_end_delimiter: str,
+    ) -> str:
+        if identifier == "" or identifier is None:
+            return ""
+
+        if DialectUtils.is_already_delimited(
+            identifier, expected_source_start_delimiter, expected_source_end_delimiter
+        ):
+            return identifier
+
+        if DialectUtils.is_already_delimited(identifier, source_start_delimiter, source_end_delimiter):
+            stripped_identifier = identifier.removeprefix(source_start_delimiter).removesuffix(source_end_delimiter)
+        else:
+            stripped_identifier = identifier
+        return f"{expected_source_start_delimiter}{stripped_identifier}{expected_source_end_delimiter}"
+
+    @staticmethod
+    def is_already_delimited(identifier: str, start_delimiter: str, end_delimiter: str) -> bool:
+        return identifier.startswith(start_delimiter) and identifier.endswith(end_delimiter)
+
+    @staticmethod
+    def _escape_backticks(identifier: str) -> str:
+        identifier = identifier[1:-1]
+        identifier = identifier.replace("`", "``")
+        return f"`{identifier}`"
+
+    @staticmethod
+    def _unescape_source_end_delimiter(identifier: str, source_end_delimiter: str) -> str:
+        return identifier.replace(f"{source_end_delimiter}{source_end_delimiter}", source_end_delimiter)
+
+    @staticmethod
+    def _escape_source_end_delimiter(identifier: str, start_end_delimiter, source_end_delimiter: str) -> str:
+        identifier = identifier[1:-1]
+        identifier = identifier.replace(source_end_delimiter, f"{source_end_delimiter}{source_end_delimiter}")
+        return f"{start_end_delimiter}{identifier}{source_end_delimiter}"
databricks/labs/lakebridge/reconcile/connectors/models.py

@@ -0,0 +1,7 @@
+import dataclasses
+
+
+@dataclasses.dataclass
+class NormalizedIdentifier:
+    ansi_normalized: str
+    source_normalized: str
databricks/labs/lakebridge/reconcile/connectors/oracle.py

@@ -9,7 +9,9 @@ from sqlglot import Dialect
 
 from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource
 from databricks.labs.lakebridge.reconcile.connectors.jdbc_reader import JDBCReaderMixin
+from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier
 from databricks.labs.lakebridge.reconcile.connectors.secrets import SecretsMixin
+from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils
 from databricks.labs.lakebridge.reconcile.recon_config import JdbcReaderOptions, Schema
 from databricks.sdk import WorkspaceClient
 
@@ -18,6 +20,7 @@ logger = logging.getLogger(__name__)
 
 class OracleDataSource(DataSource, SecretsMixin, JDBCReaderMixin):
     _DRIVER = "oracle"
+    _IDENTIFIER_DELIMITER = "\""
     _SCHEMA_QUERY = """select column_name, case when (data_precision is not null
                                                   and data_scale <> 0)
                                                   then data_type || '(' || data_precision || ',' || data_scale || ')'
@@ -91,7 +94,7 @@ class OracleDataSource(DataSource, SecretsMixin, JDBCReaderMixin):
             schema_metadata = df.select([col(c).alias(c.lower()) for c in df.columns]).collect()
             logger.info(f"Schema fetched successfully. Completed at: {datetime.now()}")
             logger.debug(f"schema_metadata: ${schema_metadata}")
-            return [Schema(field.column_name.lower(), field.data_type.lower()) for field in schema_metadata]
+            return [self._map_meta_column(field) for field in schema_metadata]
         except (RuntimeError, PySparkException) as e:
             return self.log_and_throw_exception(e, "schema", schema_query)
 
@@ -106,3 +109,10 @@ class OracleDataSource(DataSource, SecretsMixin, JDBCReaderMixin):
 
     def reader(self, query: str) -> DataFrameReader:
         return self._get_jdbc_reader(query, self.get_jdbc_url, OracleDataSource._DRIVER)
+
+    def normalize_identifier(self, identifier: str) -> NormalizedIdentifier:
+        return DialectUtils.normalize_identifier(
+            identifier,
+            source_start_delimiter=OracleDataSource._IDENTIFIER_DELIMITER,
+            source_end_delimiter=OracleDataSource._IDENTIFIER_DELIMITER,
+        )
databricks/labs/lakebridge/reconcile/connectors/snowflake.py

@@ -11,7 +11,9 @@ from cryptography.hazmat.primitives import serialization
 
 from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource
 from databricks.labs.lakebridge.reconcile.connectors.jdbc_reader import JDBCReaderMixin
+from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier
 from databricks.labs.lakebridge.reconcile.connectors.secrets import SecretsMixin
+from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils
 from databricks.labs.lakebridge.reconcile.exception import InvalidSnowflakePemPrivateKey
 from databricks.labs.lakebridge.reconcile.recon_config import JdbcReaderOptions, Schema
 from databricks.sdk import WorkspaceClient
@@ -22,6 +24,8 @@ logger = logging.getLogger(__name__)
 
 class SnowflakeDataSource(DataSource, SecretsMixin, JDBCReaderMixin):
     _DRIVER = "snowflake"
+    _IDENTIFIER_DELIMITER = "\""
+
     """
     * INFORMATION_SCHEMA:
        - see https://docs.snowflake.com/en/sql-reference/info-schema#considerations-for-replacing-show-commands-with-information-schema-views
@@ -144,9 +148,10 @@ class SnowflakeDataSource(DataSource, SecretsMixin, JDBCReaderMixin):
         try:
             logger.debug(f"Fetching schema using query: \n`{schema_query}`")
             logger.info(f"Fetching Schema: Started at: {datetime.now()}")
-            schema_metadata = self.reader(schema_query).load().collect()
+            df = self.reader(schema_query).load()
+            schema_metadata = df.select([col(c).alias(c.lower()) for c in df.columns]).collect()
             logger.info(f"Schema fetched successfully. Completed at: {datetime.now()}")
-            return [Schema(field.COLUMN_NAME.lower(), field.DATA_TYPE.lower()) for field in schema_metadata]
+            return [self._map_meta_column(field) for field in schema_metadata]
         except (RuntimeError, PySparkException) as e:
             return self.log_and_throw_exception(e, "schema", schema_query)
 
@@ -171,3 +176,10 @@ class SnowflakeDataSource(DataSource, SecretsMixin, JDBCReaderMixin):
             raise NotFound(message) from e
 
         return self._spark.read.format("snowflake").option("dbtable", f"({query}) as tmp").options(**options)
+
+    def normalize_identifier(self, identifier: str) -> NormalizedIdentifier:
+        return DialectUtils.normalize_identifier(
+            identifier,
+            source_start_delimiter=SnowflakeDataSource._IDENTIFIER_DELIMITER,
+            source_end_delimiter=SnowflakeDataSource._IDENTIFIER_DELIMITER,
+        )
databricks/labs/lakebridge/reconcile/connectors/tsql.py

@@ -9,7 +9,9 @@ from sqlglot import Dialect
 
 from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource
 from databricks.labs.lakebridge.reconcile.connectors.jdbc_reader import JDBCReaderMixin
+from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier
 from databricks.labs.lakebridge.reconcile.connectors.secrets import SecretsMixin
+from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils
 from databricks.labs.lakebridge.reconcile.recon_config import JdbcReaderOptions, Schema
 from databricks.sdk import WorkspaceClient
 
@@ -49,6 +51,7 @@ _SCHEMA_QUERY = """SELECT
 
 class TSQLServerDataSource(DataSource, SecretsMixin, JDBCReaderMixin):
     _DRIVER = "sqlserver"
+    _IDENTIFIER_DELIMITER = {"prefix": "[", "suffix": "]"}
 
     def __init__(
         self,
@@ -122,11 +125,33 @@ class TSQLServerDataSource(DataSource, SecretsMixin, JDBCReaderMixin):
         try:
             logger.debug(f"Fetching schema using query: \n`{schema_query}`")
             logger.info(f"Fetching Schema: Started at: {datetime.now()}")
-            schema_metadata = self.reader(schema_query).load().collect()
+            df = self.reader(schema_query).load()
+            schema_metadata = df.select([col(c).alias(c.lower()) for c in df.columns]).collect()
             logger.info(f"Schema fetched successfully. Completed at: {datetime.now()}")
-            return [Schema(field.COLUMN_NAME.lower(), field.DATA_TYPE.lower()) for field in schema_metadata]
+            return [self._map_meta_column(field) for field in schema_metadata]
         except (RuntimeError, PySparkException) as e:
             return self.log_and_throw_exception(e, "schema", schema_query)
 
     def reader(self, query: str, prepare_query_str="") -> DataFrameReader:
         return self._get_jdbc_reader(query, self.get_jdbc_url, self._DRIVER, prepare_query_str)
+
+    def normalize_identifier(self, identifier: str) -> NormalizedIdentifier:
+        return DialectUtils.normalize_identifier(
+            TSQLServerDataSource._normalize_quotes(identifier),
+            source_start_delimiter=TSQLServerDataSource._IDENTIFIER_DELIMITER["prefix"],
+            source_end_delimiter=TSQLServerDataSource._IDENTIFIER_DELIMITER["suffix"],
+        )
+
+    @staticmethod
+    def _normalize_quotes(identifier: str):
+        if DialectUtils.is_already_delimited(identifier, '"', '"'):
+            identifier = identifier[1:-1]
+            identifier = identifier.replace('""', '"')
+            identifier = (
+                TSQLServerDataSource._IDENTIFIER_DELIMITER["prefix"]
+                + identifier
+                + TSQLServerDataSource._IDENTIFIER_DELIMITER["suffix"]
+            )
+            return identifier
+
+        return identifier
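The T-SQL connector additionally rewrites double-quoted identifiers into bracket form before normalizing, so quoted and bracketed inputs converge. A hedged sketch of the resulting values, traced from the code in this hunk; it calls the private `_normalize_quotes` helper directly only to make the two steps visible:

    from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils
    from databricks.labs.lakebridge.reconcile.connectors.tsql import TSQLServerDataSource

    # '"Order ID"' is first rewritten to bracket form ...
    bracketed = TSQLServerDataSource._normalize_quotes('"Order ID"')
    assert bracketed == "[Order ID]"

    # ... and then normalized with the [ ] delimiters, as normalize_identifier() does internally.
    normalized = DialectUtils.normalize_identifier(bracketed, "[", "]")
    assert normalized.ansi_normalized == "`order id`"
    assert normalized.source_normalized == "[order id]"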
databricks/labs/lakebridge/reconcile/constants.py

@@ -15,10 +15,11 @@ class AutoName(Enum):
 
 
 class ReconSourceType(AutoName):
-    SNOWFLAKE = auto()
-    ORACLE = auto()
     DATABRICKS = auto()
-    TSQL = auto()
+    MSSQL = auto()
+    ORACLE = auto()
+    SNOWFLAKE = auto()
+    SYNAPSE = auto()
 
 
 class ReconReportType(AutoName):
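For reference, the reconcile source types after this change, assuming `AutoName` derives lowercase string values from the member names (this derivation is an assumption, not shown in the diff):

    from databricks.labs.lakebridge.reconcile.constants import ReconSourceType

    # Expected under the lowercase-value assumption above:
    # ['databricks', 'mssql', 'oracle', 'snowflake', 'synapse']
    print([source.value for source in ReconSourceType])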