databricks-labs-lakebridge 0.10.5__py3-none-any.whl → 0.10.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. databricks/labs/lakebridge/__about__.py +1 -1
  2. databricks/labs/lakebridge/analyzer/__init__.py +0 -0
  3. databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py +95 -0
  4. databricks/labs/lakebridge/base_install.py +24 -3
  5. databricks/labs/lakebridge/cli.py +57 -72
  6. databricks/labs/lakebridge/config.py +1 -1
  7. databricks/labs/lakebridge/contexts/application.py +11 -4
  8. databricks/labs/lakebridge/deployment/dashboard.py +2 -1
  9. databricks/labs/lakebridge/deployment/installation.py +11 -11
  10. databricks/labs/lakebridge/deployment/job.py +2 -2
  11. databricks/labs/lakebridge/helpers/file_utils.py +36 -0
  12. databricks/labs/lakebridge/install.py +228 -278
  13. databricks/labs/lakebridge/reconcile/compare.py +70 -33
  14. databricks/labs/lakebridge/reconcile/connectors/data_source.py +19 -0
  15. databricks/labs/lakebridge/reconcile/connectors/databricks.py +11 -1
  16. databricks/labs/lakebridge/reconcile/connectors/dialect_utils.py +126 -0
  17. databricks/labs/lakebridge/reconcile/connectors/models.py +7 -0
  18. databricks/labs/lakebridge/reconcile/connectors/oracle.py +11 -1
  19. databricks/labs/lakebridge/reconcile/connectors/snowflake.py +14 -2
  20. databricks/labs/lakebridge/reconcile/connectors/tsql.py +27 -2
  21. databricks/labs/lakebridge/reconcile/constants.py +4 -3
  22. databricks/labs/lakebridge/reconcile/execute.py +9 -810
  23. databricks/labs/lakebridge/reconcile/normalize_recon_config_service.py +133 -0
  24. databricks/labs/lakebridge/reconcile/query_builder/base.py +3 -7
  25. databricks/labs/lakebridge/reconcile/recon_config.py +3 -0
  26. databricks/labs/lakebridge/reconcile/recon_output_config.py +2 -1
  27. databricks/labs/lakebridge/reconcile/reconciliation.py +508 -0
  28. databricks/labs/lakebridge/reconcile/schema_compare.py +26 -19
  29. databricks/labs/lakebridge/reconcile/trigger_recon_aggregate_service.py +98 -0
  30. databricks/labs/lakebridge/reconcile/trigger_recon_service.py +253 -0
  31. databricks/labs/lakebridge/reconcile/utils.py +38 -0
  32. databricks/labs/lakebridge/transpiler/lsp/lsp_engine.py +48 -63
  33. databricks/labs/lakebridge/transpiler/repository.py +123 -0
  34. databricks/labs/lakebridge/transpiler/sqlglot/dialect_utils.py +2 -0
  35. databricks/labs/lakebridge/transpiler/transpile_engine.py +0 -18
  36. {databricks_labs_lakebridge-0.10.5.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/METADATA +1 -1
  37. {databricks_labs_lakebridge-0.10.5.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/RECORD +41 -31
  38. {databricks_labs_lakebridge-0.10.5.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/WHEEL +0 -0
  39. {databricks_labs_lakebridge-0.10.5.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/entry_points.txt +0 -0
  40. {databricks_labs_lakebridge-0.10.5.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/licenses/LICENSE +0 -0
  41. {databricks_labs_lakebridge-0.10.5.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/licenses/NOTICE +0 -0
databricks/labs/lakebridge/reconcile/compare.py

@@ -3,6 +3,7 @@ from functools import reduce
 from pyspark.sql import DataFrame, SparkSession
 from pyspark.sql.functions import col, expr, lit
 
+from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils
 from databricks.labs.lakebridge.reconcile.exception import ColumnMismatchException
 from databricks.labs.lakebridge.reconcile.recon_capture import (
     ReconIntermediatePersist,
@@ -22,7 +23,7 @@ _HASH_COLUMN_NAME = "hash_value_recon"
 _SAMPLE_ROWS = 50
 
 
-def raise_column_mismatch_exception(msg: str, source_missing: list[str], target_missing: list[str]) -> Exception:
+def _raise_column_mismatch_exception(msg: str, source_missing: list[str], target_missing: list[str]) -> Exception:
     error_msg = (
         f"{msg}\n"
         f"columns missing in source: {','.join(source_missing) if source_missing else None}\n"
@@ -33,12 +34,25 @@ def raise_column_mismatch_exception(msg: str, source_missing: list[str], target_
 
 def _generate_join_condition(source_alias, target_alias, key_columns):
     conditions = [
-        col(f"{source_alias}.{key_column}").eqNullSafe(col(f"{target_alias}.{key_column}"))
+        col(f"{source_alias}.{DialectUtils.ansi_normalize_identifier(key_column)}").eqNullSafe(
+            col(f"{target_alias}.{DialectUtils.ansi_normalize_identifier(key_column)}")
+        )
         for key_column in key_columns
     ]
     return reduce(lambda a, b: a & b, conditions)
 
 
+def _build_column_selector(table_name, column_name):
+    alias = DialectUtils.ansi_normalize_identifier(f"{table_name}_{DialectUtils.unnormalize_identifier(column_name)}")
+    return f'{table_name}.{DialectUtils.ansi_normalize_identifier(column_name)} as {alias}'
+
+
+def _build_mismatch_column(table, column):
+    return col(DialectUtils.ansi_normalize_identifier(column)).alias(
+        DialectUtils.unnormalize_identifier(column.replace(f'{table}_', '').lower())
+    )
+
+
 def reconcile_data(
     source: DataFrame,
     target: DataFrame,
@@ -59,14 +73,14 @@ def reconcile_data(
             how="full",
         )
         .selectExpr(
-            *[f'{source_alias}.{col_name} as {source_alias}_{col_name}' for col_name in source.columns],
-            *[f'{target_alias}.{col_name} as {target_alias}_{col_name}' for col_name in target.columns],
+            *[f'{_build_column_selector(source_alias, col_name)}' for col_name in source.columns],
+            *[f'{_build_column_selector(target_alias, col_name)}' for col_name in target.columns],
        )
    )
 
    # Write unmatched df to volume
    df = ReconIntermediatePersist(spark, path).write_and_read_unmatched_df_with_volumes(df)
-    logger.warning(f"Unmatched data is written to {path} successfully")
+    logger.warning(f"Unmatched data was written to {path} successfully")
 
    mismatch = _get_mismatch_data(df, source_alias, target_alias) if report_type in {"all", "data"} else None
 
@@ -74,24 +88,24 @@ def reconcile_data(
         df.filter(col(f"{source_alias}_{_HASH_COLUMN_NAME}").isNull())
         .select(
             *[
-                col(col_name).alias(col_name.replace(f'{target_alias}_', '').lower())
+                _build_mismatch_column(target_alias, col_name)
                 for col_name in df.columns
                 if col_name.startswith(f'{target_alias}_')
             ]
         )
-        .drop(_HASH_COLUMN_NAME)
+        .drop(f"{_HASH_COLUMN_NAME}")
     )
 
     missing_in_tgt = (
         df.filter(col(f"{target_alias}_{_HASH_COLUMN_NAME}").isNull())
         .select(
             *[
-                col(col_name).alias(col_name.replace(f'{source_alias}_', '').lower())
+                _build_mismatch_column(source_alias, col_name)
                 for col_name in df.columns
                 if col_name.startswith(f'{source_alias}_')
            ]
        )
-        .drop(_HASH_COLUMN_NAME)
+        .drop(f"{_HASH_COLUMN_NAME}")
    )
    mismatch_count = 0
    if mismatch:
@@ -123,23 +137,27 @@ def _get_mismatch_data(df: DataFrame, src_alias: str, tgt_alias: str) -> DataFra
         .filter(col("hash_match") == lit(False))
         .select(
             *[
-                col(col_name).alias(col_name.replace(f'{src_alias}_', '').lower())
+                _build_mismatch_column(src_alias, col_name)
                 for col_name in df.columns
                 if col_name.startswith(f'{src_alias}_')
             ]
         )
-        .drop(_HASH_COLUMN_NAME)
+        .drop(f"{_HASH_COLUMN_NAME}")
     )
 
 
-def _convert_columns_to_lowercase(df: DataFrame) -> DataFrame:
-    lowercased_columns = [col(column).alias(column.lower()) for column in df.columns]
-    return df.select(*lowercased_columns)
+def _build_capture_df(df: DataFrame) -> DataFrame:
+    columns = [
+        col(DialectUtils.ansi_normalize_identifier(column)).alias(DialectUtils.unnormalize_identifier(column))
+        for column in df.columns
+    ]
+    return df.select(*columns)
 
 
 def capture_mismatch_data_and_columns(source: DataFrame, target: DataFrame, key_columns: list[str]) -> MismatchOutput:
-    source_df = _convert_columns_to_lowercase(source)
-    target_df = _convert_columns_to_lowercase(target)
+    source_df = _build_capture_df(source)
+    target_df = _build_capture_df(target)
+    unnormalized_key_columns = [DialectUtils.unnormalize_identifier(column) for column in key_columns]
 
     source_columns = source_df.columns
     target_columns = target_df.columns
@@ -148,10 +166,10 @@ def capture_mismatch_data_and_columns(source: DataFrame, target: DataFrame, key_
         message = "source and target should have same columns for capturing the mismatch data"
         source_missing = [column for column in target_columns if column not in source_columns]
         target_missing = [column for column in source_columns if column not in target_columns]
-        raise raise_column_mismatch_exception(message, source_missing, target_missing)
+        raise _raise_column_mismatch_exception(message, source_missing, target_missing)
 
-    check_columns = [column for column in source_columns if column not in key_columns]
-    mismatch_df = _get_mismatch_df(source_df, target_df, key_columns, check_columns)
+    check_columns = [column for column in source_columns if column not in unnormalized_key_columns]
+    mismatch_df = _get_mismatch_df(source_df, target_df, unnormalized_key_columns, check_columns)
     mismatch_columns = _get_mismatch_columns(mismatch_df, check_columns)
     return MismatchOutput(mismatch_df, mismatch_columns)
 
@@ -167,31 +185,50 @@ def _get_mismatch_columns(df: DataFrame, columns: list[str]):
     return mismatch_columns
 
 
+def _normalize_mismatch_df_col(column, suffix):
+    unnormalized = DialectUtils.unnormalize_identifier(column) + suffix
+    return DialectUtils.ansi_normalize_identifier(unnormalized)
+
+
+def _unnormalize_mismatch_df_col(column, suffix):
+    unnormalized = DialectUtils.unnormalize_identifier(column) + suffix
+    return unnormalized
+
+
 def _get_mismatch_df(source: DataFrame, target: DataFrame, key_columns: list[str], column_list: list[str]):
-    source_aliased = [col('base.' + column).alias(column + '_base') for column in column_list]
-    target_aliased = [col('compare.' + column).alias(column + '_compare') for column in column_list]
+    source_aliased = [
+        col('base.' + DialectUtils.ansi_normalize_identifier(column)).alias(
+            _unnormalize_mismatch_df_col(column, '_base')
+        )
+        for column in column_list
+    ]
+    target_aliased = [
+        col('compare.' + DialectUtils.ansi_normalize_identifier(column)).alias(
+            _unnormalize_mismatch_df_col(column, '_compare')
+        )
+        for column in column_list
+    ]
 
-    match_expr = [expr(f"{column}_base=={column}_compare").alias(column + "_match") for column in column_list]
-    key_cols = [col(column) for column in key_columns]
+    match_expr = [
+        expr(f"{_normalize_mismatch_df_col(column,'_base')}=={_normalize_mismatch_df_col(column,'_compare')}").alias(
+            _unnormalize_mismatch_df_col(column, '_match')
+        )
+        for column in column_list
+    ]
+    key_cols = [col(DialectUtils.ansi_normalize_identifier(column)) for column in key_columns]
     select_expr = key_cols + source_aliased + target_aliased + match_expr
 
-    filter_columns = " and ".join([column + "_match" for column in column_list])
-    filter_expr = ~expr(filter_columns)
-
     logger.info(f"KEY COLUMNS: {key_columns}")
-    logger.info(f"FILTER COLUMNS: {filter_expr}")
     logger.info(f"SELECT COLUMNS: {select_expr}")
 
     mismatch_df = (
         source.alias('base').join(other=target.alias('compare'), on=key_columns, how="inner").select(*select_expr)
     )
 
-    compare_columns = [column for column in mismatch_df.columns if column not in key_columns]
-    return mismatch_df.select(*key_columns + sorted(compare_columns))
-
-
-def alias_column_str(alias: str, columns: list[str]) -> list[str]:
-    return [f"{alias}.{column}" for column in columns]
+    compare_columns = [
+        DialectUtils.ansi_normalize_identifier(column) for column in mismatch_df.columns if column not in key_columns
+    ]
+    return mismatch_df.select(*key_cols + sorted(compare_columns))
 
 
 def _generate_agg_join_condition(source_alias: str, target_alias: str, key_columns: list[str]):
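Note on the compare.py changes above: _build_column_selector and _build_mismatch_column route every column reference through DialectUtils, so delimited identifiers (for example column names containing spaces) survive the full outer join and the later de-aliasing. A minimal sketch of the expected select expression, for illustration only and assuming the 0.10.7 wheel and pyspark are importable:

    from databricks.labs.lakebridge.reconcile.compare import _build_column_selector

    # A backtick-delimited column keeps its delimiters in both the reference and the
    # generated alias (expected output based on the helper shown in the diff above).
    selector = _build_column_selector("source", "`my col`")
    print(selector)  # source.`my col` as `source_my col`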
databricks/labs/lakebridge/reconcile/connectors/data_source.py

@@ -3,6 +3,7 @@ from abc import ABC, abstractmethod
 
 from pyspark.sql import DataFrame
 
+from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier
 from databricks.labs.lakebridge.reconcile.exception import DataSourceRuntimeException
 from databricks.labs.lakebridge.reconcile.recon_config import JdbcReaderOptions, Schema
 
@@ -31,12 +32,27 @@ class DataSource(ABC):
     ) -> list[Schema]:
         return NotImplemented
 
+    @abstractmethod
+    def normalize_identifier(self, identifier: str) -> NormalizedIdentifier:
+        pass
+
     @classmethod
     def log_and_throw_exception(cls, exception: Exception, fetch_type: str, query: str):
         error_msg = f"Runtime exception occurred while fetching {fetch_type} using {query} : {exception}"
         logger.warning(error_msg)
         raise DataSourceRuntimeException(error_msg) from exception
 
+    def _map_meta_column(self, meta_column) -> Schema:
+        """Create a normalized Schema DTO from the database metadata.
+
+        Used in the implementations of get_schema to build a Schema DTO from the `INFORMATION_SCHEMA` query result.
+        The returned Schema is normalized in case the database has columns with special characters that need standardizing.
+        """
+        name = meta_column.col_name
+        dtype = meta_column.data_type.strip().lower()
+        normalized = self.normalize_identifier(name)
+        return Schema(normalized.ansi_normalized, dtype, normalized.ansi_normalized, normalized.source_normalized)
+
 
 class MockDataSource(DataSource):
 
@@ -70,3 +86,6 @@ class MockDataSource(DataSource):
         if not mock_schema:
             return self.log_and_throw_exception(self._exception, "schema", f"({catalog}, {schema}, {table})")
         return mock_schema
+
+    def normalize_identifier(self, identifier: str) -> NormalizedIdentifier:
+        return NormalizedIdentifier(identifier, identifier)
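The new abstract normalize_identifier method is the single dialect-specific hook: each connector supplies its delimiter pair, and _map_meta_column stores both the ANSI (backtick) form and the source-native form on the Schema DTO. As an illustration only (assuming the 0.10.7 wheel is installed), the double-quote delimiters used by the Oracle and Snowflake connectors further down map as follows:

    from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils

    # Quoted identifiers are lower-cased and re-emitted in both normalized forms.
    ni = DialectUtils.normalize_identifier('"Customer Id"', '"', '"')
    assert ni.ansi_normalized == '`customer id`'
    assert ni.source_normalized == '"customer id"'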
databricks/labs/lakebridge/reconcile/connectors/databricks.py

@@ -8,7 +8,9 @@ from pyspark.sql.functions import col
 from sqlglot import Dialect
 
 from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource
+from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier
 from databricks.labs.lakebridge.reconcile.connectors.secrets import SecretsMixin
+from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils
 from databricks.labs.lakebridge.reconcile.recon_config import JdbcReaderOptions, Schema
 from databricks.sdk import WorkspaceClient
 
@@ -35,6 +37,7 @@ def _get_schema_query(catalog: str, schema: str, table: str):
 
 
 class DatabricksDataSource(DataSource, SecretsMixin):
+    _IDENTIFIER_DELIMITER = "`"
 
     def __init__(
         self,
@@ -82,6 +85,13 @@ class DatabricksDataSource(DataSource, SecretsMixin):
             logger.info(f"Fetching Schema: Started at: {datetime.now()}")
             schema_metadata = self._spark.sql(schema_query).where("col_name not like '#%'").distinct().collect()
             logger.info(f"Schema fetched successfully. Completed at: {datetime.now()}")
-            return [Schema(field.col_name.lower(), field.data_type.lower()) for field in schema_metadata]
+            return [self._map_meta_column(field) for field in schema_metadata]
         except (RuntimeError, PySparkException) as e:
             return self.log_and_throw_exception(e, "schema", schema_query)
+
+    def normalize_identifier(self, identifier: str) -> NormalizedIdentifier:
+        return DialectUtils.normalize_identifier(
+            identifier,
+            source_start_delimiter=DatabricksDataSource._IDENTIFIER_DELIMITER,
+            source_end_delimiter=DatabricksDataSource._IDENTIFIER_DELIMITER,
+        )
databricks/labs/lakebridge/reconcile/connectors/dialect_utils.py

@@ -0,0 +1,126 @@
+from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier
+
+
+class DialectUtils:
+    _ANSI_IDENTIFIER_DELIMITER = "`"
+
+    @staticmethod
+    def unnormalize_identifier(identifier: str) -> str:
+        """Return an ansi identifier without the outer backticks.
+
+        Use this at your own risk as the missing outer backticks will result in bugs.
+        E.g. <`mary's lamb`> is returned <mary's lamb> so the outer backticks are needed.
+        This is useful for scenarios where the returned identifier will be part of another delimited identifier.
+
+        :param identifier: a database identifier
+        :return: ansi identifier without the outer backticks
+        """
+        ansi = DialectUtils.ansi_normalize_identifier(identifier)
+        unescape = (
+            DialectUtils._unescape_source_end_delimiter(ansi[1:-1], DialectUtils._ANSI_IDENTIFIER_DELIMITER)
+            if ansi
+            else ansi
+        )
+        return unescape
+
+    @staticmethod
+    def ansi_normalize_identifier(identifier: str) -> str:
+        return DialectUtils.normalize_identifier(
+            identifier, DialectUtils._ANSI_IDENTIFIER_DELIMITER, DialectUtils._ANSI_IDENTIFIER_DELIMITER
+        ).ansi_normalized
+
+    @staticmethod
+    def normalize_identifier(
+        identifier: str, source_start_delimiter: str, source_end_delimiter: str
+    ) -> NormalizedIdentifier:
+        identifier = identifier.strip().lower()
+
+        ansi = DialectUtils._normalize_identifier_source_agnostic(
+            identifier,
+            source_start_delimiter,
+            source_end_delimiter,
+            DialectUtils._ANSI_IDENTIFIER_DELIMITER,
+            DialectUtils._ANSI_IDENTIFIER_DELIMITER,
+        )
+
+        # Input was already ansi normalized
+        if ansi == identifier:
+            source = DialectUtils._normalize_identifier_source_agnostic(
+                identifier,
+                DialectUtils._ANSI_IDENTIFIER_DELIMITER,
+                DialectUtils._ANSI_IDENTIFIER_DELIMITER,
+                source_start_delimiter,
+                source_end_delimiter,
+            )
+
+            # Ansi has backticks escaped which has to be unescaped for other delimiters and escape source end delimiters
+            if source != ansi:
+                source = DialectUtils._unescape_source_end_delimiter(source, DialectUtils._ANSI_IDENTIFIER_DELIMITER)
+                source = (
+                    DialectUtils._escape_source_end_delimiter(source, source_start_delimiter, source_end_delimiter)
+                    if source
+                    else source
+                )
+        else:
+            # Make sure backticks are escaped properly for ansi and source end delimiters are unescaped
+            ansi = DialectUtils._unescape_source_end_delimiter(ansi, source_end_delimiter)
+            ansi = DialectUtils._escape_backticks(ansi) if ansi else ansi
+
+            if source_end_delimiter != DialectUtils._ANSI_IDENTIFIER_DELIMITER:
+                ansi = DialectUtils._unescape_source_end_delimiter(ansi, source_end_delimiter)
+
+            source = DialectUtils._normalize_identifier_source_agnostic(
+                identifier, source_start_delimiter, source_end_delimiter, source_start_delimiter, source_end_delimiter
+            )
+
+            # Make sure source end delimiter is escaped else nothing as it was already normalized
+            if source != identifier:
+                source = (
+                    DialectUtils._escape_source_end_delimiter(source, source_start_delimiter, source_end_delimiter)
+                    if source
+                    else source
+                )
+
+        return NormalizedIdentifier(ansi, source)
+
+    @staticmethod
+    def _normalize_identifier_source_agnostic(
+        identifier: str,
+        source_start_delimiter: str,
+        source_end_delimiter: str,
+        expected_source_start_delimiter: str,
+        expected_source_end_delimiter: str,
+    ) -> str:
+        if identifier == "" or identifier is None:
+            return ""
+
+        if DialectUtils.is_already_delimited(
+            identifier, expected_source_start_delimiter, expected_source_end_delimiter
+        ):
+            return identifier
+
+        if DialectUtils.is_already_delimited(identifier, source_start_delimiter, source_end_delimiter):
+            stripped_identifier = identifier.removeprefix(source_start_delimiter).removesuffix(source_end_delimiter)
+        else:
+            stripped_identifier = identifier
+        return f"{expected_source_start_delimiter}{stripped_identifier}{expected_source_end_delimiter}"
+
+    @staticmethod
+    def is_already_delimited(identifier: str, start_delimiter: str, end_delimiter: str) -> bool:
+        return identifier.startswith(start_delimiter) and identifier.endswith(end_delimiter)
+
+    @staticmethod
+    def _escape_backticks(identifier: str) -> str:
+        identifier = identifier[1:-1]
+        identifier = identifier.replace("`", "``")
+        return f"`{identifier}`"
+
+    @staticmethod
+    def _unescape_source_end_delimiter(identifier: str, source_end_delimiter: str) -> str:
+        return identifier.replace(f"{source_end_delimiter}{source_end_delimiter}", source_end_delimiter)
+
+    @staticmethod
+    def _escape_source_end_delimiter(identifier: str, start_end_delimiter, source_end_delimiter: str) -> str:
+        identifier = identifier[1:-1]
+        identifier = identifier.replace(source_end_delimiter, f"{source_end_delimiter}{source_end_delimiter}")
+        return f"{start_end_delimiter}{identifier}{source_end_delimiter}"
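The DialectUtils class above is the core of the new identifier handling: ansi_normalize_identifier always yields a lower-cased, backtick-delimited identifier, and unnormalize_identifier strips only the outer backticks so the result can be embedded in a larger delimited identifier. A short round-trip sketch, illustrative only and assuming the 0.10.7 wheel is installed:

    from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils

    # Plain and already-delimited inputs normalize to the same ANSI form.
    assert DialectUtils.ansi_normalize_identifier("my col") == "`my col`"
    assert DialectUtils.ansi_normalize_identifier("`my col`") == "`my col`"

    # Stripping the outer backticks is the inverse step used when building composite aliases.
    assert DialectUtils.unnormalize_identifier("`my col`") == "my col"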
databricks/labs/lakebridge/reconcile/connectors/models.py

@@ -0,0 +1,7 @@
+import dataclasses
+
+
+@dataclasses.dataclass
+class NormalizedIdentifier:
+    ansi_normalized: str
+    source_normalized: str
databricks/labs/lakebridge/reconcile/connectors/oracle.py

@@ -9,7 +9,9 @@ from sqlglot import Dialect
 
 from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource
 from databricks.labs.lakebridge.reconcile.connectors.jdbc_reader import JDBCReaderMixin
+from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier
 from databricks.labs.lakebridge.reconcile.connectors.secrets import SecretsMixin
+from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils
 from databricks.labs.lakebridge.reconcile.recon_config import JdbcReaderOptions, Schema
 from databricks.sdk import WorkspaceClient
 
@@ -18,6 +20,7 @@ logger = logging.getLogger(__name__)
 
 class OracleDataSource(DataSource, SecretsMixin, JDBCReaderMixin):
     _DRIVER = "oracle"
+    _IDENTIFIER_DELIMITER = "\""
     _SCHEMA_QUERY = """select column_name, case when (data_precision is not null
                                                       and data_scale <> 0)
                                                  then data_type || '(' || data_precision || ',' || data_scale || ')'
@@ -91,7 +94,7 @@ class OracleDataSource(DataSource, SecretsMixin, JDBCReaderMixin):
             schema_metadata = df.select([col(c).alias(c.lower()) for c in df.columns]).collect()
             logger.info(f"Schema fetched successfully. Completed at: {datetime.now()}")
             logger.debug(f"schema_metadata: ${schema_metadata}")
-            return [Schema(field.column_name.lower(), field.data_type.lower()) for field in schema_metadata]
+            return [self._map_meta_column(field) for field in schema_metadata]
         except (RuntimeError, PySparkException) as e:
             return self.log_and_throw_exception(e, "schema", schema_query)
 
@@ -106,3 +109,10 @@ class OracleDataSource(DataSource, SecretsMixin, JDBCReaderMixin):
 
     def reader(self, query: str) -> DataFrameReader:
         return self._get_jdbc_reader(query, self.get_jdbc_url, OracleDataSource._DRIVER)
+
+    def normalize_identifier(self, identifier: str) -> NormalizedIdentifier:
+        return DialectUtils.normalize_identifier(
+            identifier,
+            source_start_delimiter=OracleDataSource._IDENTIFIER_DELIMITER,
+            source_end_delimiter=OracleDataSource._IDENTIFIER_DELIMITER,
+        )
databricks/labs/lakebridge/reconcile/connectors/snowflake.py

@@ -11,7 +11,9 @@ from cryptography.hazmat.primitives import serialization
 
 from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource
 from databricks.labs.lakebridge.reconcile.connectors.jdbc_reader import JDBCReaderMixin
+from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier
 from databricks.labs.lakebridge.reconcile.connectors.secrets import SecretsMixin
+from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils
 from databricks.labs.lakebridge.reconcile.exception import InvalidSnowflakePemPrivateKey
 from databricks.labs.lakebridge.reconcile.recon_config import JdbcReaderOptions, Schema
 from databricks.sdk import WorkspaceClient
@@ -22,6 +24,8 @@ logger = logging.getLogger(__name__)
 
 class SnowflakeDataSource(DataSource, SecretsMixin, JDBCReaderMixin):
     _DRIVER = "snowflake"
+    _IDENTIFIER_DELIMITER = "\""
+
     """
     * INFORMATION_SCHEMA:
        - see https://docs.snowflake.com/en/sql-reference/info-schema#considerations-for-replacing-show-commands-with-information-schema-views
@@ -144,9 +148,10 @@ class SnowflakeDataSource(DataSource, SecretsMixin, JDBCReaderMixin):
         try:
             logger.debug(f"Fetching schema using query: \n`{schema_query}`")
             logger.info(f"Fetching Schema: Started at: {datetime.now()}")
-            schema_metadata = self.reader(schema_query).load().collect()
+            df = self.reader(schema_query).load()
+            schema_metadata = df.select([col(c).alias(c.lower()) for c in df.columns]).collect()
             logger.info(f"Schema fetched successfully. Completed at: {datetime.now()}")
-            return [Schema(field.COLUMN_NAME.lower(), field.DATA_TYPE.lower()) for field in schema_metadata]
+            return [self._map_meta_column(field) for field in schema_metadata]
         except (RuntimeError, PySparkException) as e:
             return self.log_and_throw_exception(e, "schema", schema_query)
 
@@ -171,3 +176,10 @@ class SnowflakeDataSource(DataSource, SecretsMixin, JDBCReaderMixin):
             raise NotFound(message) from e
 
         return self._spark.read.format("snowflake").option("dbtable", f"({query}) as tmp").options(**options)
+
+    def normalize_identifier(self, identifier: str) -> NormalizedIdentifier:
+        return DialectUtils.normalize_identifier(
+            identifier,
+            source_start_delimiter=SnowflakeDataSource._IDENTIFIER_DELIMITER,
+            source_end_delimiter=SnowflakeDataSource._IDENTIFIER_DELIMITER,
+        )
databricks/labs/lakebridge/reconcile/connectors/tsql.py

@@ -9,7 +9,9 @@ from sqlglot import Dialect
 
 from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource
 from databricks.labs.lakebridge.reconcile.connectors.jdbc_reader import JDBCReaderMixin
+from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier
 from databricks.labs.lakebridge.reconcile.connectors.secrets import SecretsMixin
+from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils
 from databricks.labs.lakebridge.reconcile.recon_config import JdbcReaderOptions, Schema
 from databricks.sdk import WorkspaceClient
 
@@ -49,6 +51,7 @@ _SCHEMA_QUERY = """SELECT
 
 class TSQLServerDataSource(DataSource, SecretsMixin, JDBCReaderMixin):
     _DRIVER = "sqlserver"
+    _IDENTIFIER_DELIMITER = {"prefix": "[", "suffix": "]"}
 
     def __init__(
         self,
@@ -122,11 +125,33 @@ class TSQLServerDataSource(DataSource, SecretsMixin, JDBCReaderMixin):
         try:
             logger.debug(f"Fetching schema using query: \n`{schema_query}`")
             logger.info(f"Fetching Schema: Started at: {datetime.now()}")
-            schema_metadata = self.reader(schema_query).load().collect()
+            df = self.reader(schema_query).load()
+            schema_metadata = df.select([col(c).alias(c.lower()) for c in df.columns]).collect()
             logger.info(f"Schema fetched successfully. Completed at: {datetime.now()}")
-            return [Schema(field.COLUMN_NAME.lower(), field.DATA_TYPE.lower()) for field in schema_metadata]
+            return [self._map_meta_column(field) for field in schema_metadata]
         except (RuntimeError, PySparkException) as e:
             return self.log_and_throw_exception(e, "schema", schema_query)
 
     def reader(self, query: str, prepare_query_str="") -> DataFrameReader:
         return self._get_jdbc_reader(query, self.get_jdbc_url, self._DRIVER, prepare_query_str)
+
+    def normalize_identifier(self, identifier: str) -> NormalizedIdentifier:
+        return DialectUtils.normalize_identifier(
+            TSQLServerDataSource._normalize_quotes(identifier),
+            source_start_delimiter=TSQLServerDataSource._IDENTIFIER_DELIMITER["prefix"],
+            source_end_delimiter=TSQLServerDataSource._IDENTIFIER_DELIMITER["suffix"],
+        )
+
+    @staticmethod
+    def _normalize_quotes(identifier: str):
+        if DialectUtils.is_already_delimited(identifier, '"', '"'):
+            identifier = identifier[1:-1]
+            identifier = identifier.replace('""', '"')
+            identifier = (
+                TSQLServerDataSource._IDENTIFIER_DELIMITER["prefix"]
+                + identifier
+                + TSQLServerDataSource._IDENTIFIER_DELIMITER["suffix"]
+            )
+            return identifier
+
+        return identifier
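T-SQL accepts both double-quoted and bracketed identifiers, so _normalize_quotes first rewrites the quoted form into brackets before handing off to DialectUtils. An illustrative sketch of the expected behaviour (it exercises a private helper and assumes the 0.10.7 wheel plus its Spark and SDK dependencies are installed):

    from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils
    from databricks.labs.lakebridge.reconcile.connectors.tsql import TSQLServerDataSource

    # '"Order Id"' is rewritten to '[Order Id]', then normalized to both forms.
    bracketed = TSQLServerDataSource._normalize_quotes('"Order Id"')
    ni = DialectUtils.normalize_identifier(bracketed, "[", "]")
    assert ni.ansi_normalized == "`order id`"
    assert ni.source_normalized == "[order id]"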
databricks/labs/lakebridge/reconcile/constants.py

@@ -15,10 +15,11 @@ class AutoName(Enum):
 
 
 class ReconSourceType(AutoName):
-    SNOWFLAKE = auto()
-    ORACLE = auto()
     DATABRICKS = auto()
-    TSQL = auto()
+    MSSQL = auto()
+    ORACLE = auto()
+    SNOWFLAKE = auto()
+    SYNAPSE = auto()
 
 
 class ReconReportType(AutoName):