databricks-labs-lakebridge 0.10.6__py3-none-any.whl → 0.10.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. databricks/labs/lakebridge/__about__.py +1 -1
  2. databricks/labs/lakebridge/analyzer/__init__.py +0 -0
  3. databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py +95 -0
  4. databricks/labs/lakebridge/assessments/profiler_validator.py +103 -0
  5. databricks/labs/lakebridge/base_install.py +20 -3
  6. databricks/labs/lakebridge/cli.py +32 -59
  7. databricks/labs/lakebridge/contexts/application.py +7 -0
  8. databricks/labs/lakebridge/deployment/job.py +2 -2
  9. databricks/labs/lakebridge/helpers/file_utils.py +36 -0
  10. databricks/labs/lakebridge/helpers/validation.py +5 -3
  11. databricks/labs/lakebridge/install.py +73 -484
  12. databricks/labs/lakebridge/reconcile/compare.py +70 -33
  13. databricks/labs/lakebridge/reconcile/connectors/data_source.py +24 -1
  14. databricks/labs/lakebridge/reconcile/connectors/databricks.py +12 -1
  15. databricks/labs/lakebridge/reconcile/connectors/dialect_utils.py +126 -0
  16. databricks/labs/lakebridge/reconcile/connectors/models.py +7 -0
  17. databricks/labs/lakebridge/reconcile/connectors/oracle.py +12 -1
  18. databricks/labs/lakebridge/reconcile/connectors/secrets.py +19 -1
  19. databricks/labs/lakebridge/reconcile/connectors/snowflake.py +63 -30
  20. databricks/labs/lakebridge/reconcile/connectors/tsql.py +28 -2
  21. databricks/labs/lakebridge/reconcile/constants.py +4 -3
  22. databricks/labs/lakebridge/reconcile/execute.py +9 -810
  23. databricks/labs/lakebridge/reconcile/normalize_recon_config_service.py +133 -0
  24. databricks/labs/lakebridge/reconcile/query_builder/base.py +53 -18
  25. databricks/labs/lakebridge/reconcile/query_builder/expression_generator.py +8 -2
  26. databricks/labs/lakebridge/reconcile/query_builder/hash_query.py +7 -13
  27. databricks/labs/lakebridge/reconcile/query_builder/sampling_query.py +18 -19
  28. databricks/labs/lakebridge/reconcile/query_builder/threshold_query.py +36 -15
  29. databricks/labs/lakebridge/reconcile/recon_config.py +3 -15
  30. databricks/labs/lakebridge/reconcile/recon_output_config.py +2 -1
  31. databricks/labs/lakebridge/reconcile/reconciliation.py +511 -0
  32. databricks/labs/lakebridge/reconcile/schema_compare.py +26 -19
  33. databricks/labs/lakebridge/reconcile/trigger_recon_aggregate_service.py +78 -0
  34. databricks/labs/lakebridge/reconcile/trigger_recon_service.py +256 -0
  35. databricks/labs/lakebridge/reconcile/utils.py +38 -0
  36. databricks/labs/lakebridge/transpiler/execute.py +34 -28
  37. databricks/labs/lakebridge/transpiler/installers.py +523 -0
  38. databricks/labs/lakebridge/transpiler/lsp/lsp_engine.py +47 -60
  39. databricks/labs/lakebridge/transpiler/sqlglot/dialect_utils.py +2 -0
  40. databricks/labs/lakebridge/transpiler/transpile_engine.py +0 -18
  41. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/METADATA +1 -1
  42. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/RECORD +46 -35
  43. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/WHEEL +0 -0
  44. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/entry_points.txt +0 -0
  45. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/licenses/LICENSE +0 -0
  46. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/licenses/NOTICE +0 -0
@@ -0,0 +1,133 @@
1
+ import dataclasses
2
+
3
+ from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource
4
+ from databricks.labs.lakebridge.reconcile.recon_config import (
5
+ Table,
6
+ Aggregate,
7
+ ColumnMapping,
8
+ Transformation,
9
+ ColumnThresholds,
10
+ )
11
+
12
+
13
class NormalizeReconConfigService:
    """Rewrite the column identifiers of a reconciliation ``Table`` config in normalized form.

    Every column name is passed to ``normalize_identifier`` of a data source and replaced by the
    ``ansi_normalized`` attribute of the result. The source data source is used for every name except
    ``ColumnMapping.target_name``, which is normalized with the target data source.
    """

    def __init__(self, source: DataSource, target: DataSource):
        self.source = source
        self.target = target

    def normalize_recon_table_config(self, table: Table) -> Table:
        """Return a normalized copy of ``table``.

        The copy is shallow (``dataclasses.replace``). Each step rebinds a field of the copy to freshly
        built values, so no field of the ``table`` object passed in is reassigned.
        """
        normalized_table = dataclasses.replace(table)

        steps = (
            self._normalize_sampling,
            self._normalize_aggs,
            self._normalize_join_cols,
            self._normalize_select_cols,
            self._normalize_drop_cols,
            self._normalize_col_mappings,
            self._normalize_transformations,
            self._normalize_col_thresholds,
            self._normalize_jdbc_options,
        )
        for step in steps:
            step(normalized_table)

        return normalized_table

    def _ansi_normalize(self, column: str) -> str:
        """Return the ``ansi_normalized`` form of ``column`` according to the source data source."""
        return self.source.normalize_identifier(column).ansi_normalized

    def _ansi_normalize_all(self, columns: list[str] | None) -> list[str] | None:
        """Normalize every name in ``columns``; a missing or empty collection yields ``None``."""
        if not columns:
            return None
        return [self._ansi_normalize(column) for column in columns]

    def _normalize_sampling(self, table: Table) -> Table:
        """Replace ``table.sampling_options`` with a copy whose stratified columns are normalized."""
        if table.sampling_options:
            # Copy first, then assign: the options object supplied by the caller is left as it was.
            normalized_sampling = dataclasses.replace(table.sampling_options)
            normalized_sampling.stratified_columns = self._ansi_normalize_all(normalized_sampling.stratified_columns)
            table.sampling_options = normalized_sampling
        return table

    def _normalize_aggs(self, table: Table) -> Table:
        """Replace ``table.aggregates`` with normalized copies, or ``None`` when there are none."""
        table.aggregates = [self._normalize_agg(agg) for agg in table.aggregates] if table.aggregates else None
        return table

    def _normalize_agg(self, agg: Aggregate) -> Aggregate:
        """Return a copy of ``agg`` with normalized aggregate and group-by columns."""
        normalized = dataclasses.replace(agg)
        # agg_columns is always iterated (no ``None`` fallback), unlike the optional group_by_columns.
        normalized.agg_columns = [self._ansi_normalize(column) for column in normalized.agg_columns]
        normalized.group_by_columns = self._ansi_normalize_all(normalized.group_by_columns)
        return normalized

    def _normalize_join_cols(self, table: Table) -> Table:
        """Normalize ``table.join_columns`` in place on ``table``."""
        table.join_columns = self._ansi_normalize_all(table.join_columns)
        return table

    def _normalize_select_cols(self, table: Table) -> Table:
        """Normalize ``table.select_columns`` in place on ``table``."""
        table.select_columns = self._ansi_normalize_all(table.select_columns)
        return table

    def _normalize_drop_cols(self, table: Table) -> Table:
        """Normalize ``table.drop_columns`` in place on ``table``."""
        table.drop_columns = self._ansi_normalize_all(table.drop_columns)
        return table

    def _normalize_col_mappings(self, table: Table) -> Table:
        """Replace ``table.column_mapping`` with normalized mappings, or ``None`` when there are none."""
        table.column_mapping = (
            [self._normalize_col_mapping(mapping) for mapping in table.column_mapping]
            if table.column_mapping
            else None
        )
        return table

    def _normalize_col_mapping(self, mapping: ColumnMapping) -> ColumnMapping:
        """Build a new ``ColumnMapping``: source name via the source, target name via the target."""
        return ColumnMapping(
            source_name=self._ansi_normalize(mapping.source_name),
            target_name=self.target.normalize_identifier(mapping.target_name).ansi_normalized,
        )

    def _normalize_transformations(self, table: Table) -> Table:
        """Replace ``table.transformations`` with normalized copies, or ``None`` when there are none."""
        table.transformations = (
            [self._normalize_transformation(transform) for transform in table.transformations]
            if table.transformations
            else None
        )
        return table

    def _normalize_transformation(self, transform: Transformation) -> Transformation:
        """Normalize a user-configured transformation.

        The user configures the table column and passes SQL code to transform the source table and the
        target table. This is useful when the data changes, e.g. when migrating `datetime`s. Only
        ``column_name`` is normalized; the SQL code is left untouched and it is the user's responsibility
        to pass valid SQL for both the source database and the target database.
        """
        normalized = dataclasses.replace(transform)
        normalized.column_name = self._ansi_normalize(transform.column_name)
        return normalized

    def _normalize_col_thresholds(self, table: Table) -> Table:
        """Replace ``table.column_thresholds`` with normalized copies, or ``None`` when there are none."""
        table.column_thresholds = (
            [self._normalize_col_threshold(threshold) for threshold in table.column_thresholds]
            if table.column_thresholds
            else None
        )
        return table

    def _normalize_col_threshold(self, threshold: ColumnThresholds) -> ColumnThresholds:
        """Return a copy of ``threshold`` with a normalized ``column_name``."""
        normalized = dataclasses.replace(threshold)
        normalized.column_name = self._ansi_normalize(threshold.column_name)
        return normalized

    def _normalize_jdbc_options(self, table: Table) -> Table:
        """Replace ``table.jdbc_reader_options`` with a copy whose partition column is normalized."""
        if table.jdbc_reader_options:
            normalized = dataclasses.replace(table.jdbc_reader_options)
            normalized.partition_column = (
                self._ansi_normalize(normalized.partition_column) if normalized.partition_column else None
            )
            table.jdbc_reader_options = normalized

        return table
@@ -4,10 +4,13 @@ from abc import ABC
4
4
  import sqlglot.expressions as exp
5
5
  from sqlglot import Dialect, parse_one
6
6
 
7
+ from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource
8
+ from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils
7
9
  from databricks.labs.lakebridge.reconcile.exception import InvalidInputException
8
10
  from databricks.labs.lakebridge.reconcile.query_builder.expression_generator import (
9
11
  DataType_transform_mapping,
10
12
  transform_expression,
13
+ build_column,
11
14
  )
12
15
  from databricks.labs.lakebridge.reconcile.recon_config import Schema, Table, Aggregate
13
16
  from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import get_dialect, SQLGLOT_DIALECTS
@@ -16,21 +19,16 @@ logger = logging.getLogger(__name__)
16
19
 
17
20
 
18
21
  class QueryBuilder(ABC):
19
- def __init__(
20
- self,
21
- table_conf: Table,
22
- schema: list[Schema],
23
- layer: str,
24
- engine: Dialect,
25
- ):
22
+ def __init__(self, table_conf: Table, schema: list[Schema], layer: str, engine: Dialect, data_source: DataSource):
26
23
  self._table_conf = table_conf
27
24
  self._schema = schema
28
25
  self._layer = layer
29
26
  self._engine = engine
27
+ self._data_source = data_source
30
28
 
31
29
  @property
32
30
  def engine(self) -> Dialect:
33
- return self._engine
31
+ return self._engine if self.layer == "source" else get_dialect("databricks")
34
32
 
35
33
  @property
36
34
  def layer(self) -> str:
@@ -70,7 +68,25 @@ class QueryBuilder(ABC):
70
68
 
71
69
  @property
72
70
  def user_transformations(self) -> dict[str, str]:
73
- return self._table_conf.get_transformation_dict(self._layer)
71
+ if self._table_conf.transformations:
72
+ if self.layer == "source":
73
+ return {
74
+ trans.column_name: (
75
+ trans.source
76
+ if trans.source
77
+ else self._data_source.normalize_identifier(trans.column_name).source_normalized
78
+ )
79
+ for trans in self._table_conf.transformations
80
+ }
81
+ return {
82
+ self._table_conf.get_layer_src_to_tgt_col_mapping(trans.column_name, self.layer): (
83
+ trans.target
84
+ if trans.target
85
+ else self._table_conf.get_layer_src_to_tgt_col_mapping(trans.column_name, self.layer)
86
+ )
87
+ for trans in self._table_conf.transformations
88
+ }
89
+ return {}
74
90
 
75
91
  @property
76
92
  def aggregates(self) -> list[Aggregate] | None:
@@ -93,10 +109,12 @@ class QueryBuilder(ABC):
93
109
 
94
110
  def _user_transformer(self, node: exp.Expression, user_transformations: dict[str, str]) -> exp.Expression:
95
111
  if isinstance(node, exp.Column) and user_transformations:
96
- dialect = self.engine if self.layer == "source" else get_dialect("databricks")
97
- column_name = node.name
98
- if column_name in user_transformations.keys():
99
- return parse_one(user_transformations.get(column_name, column_name), read=dialect)
112
+ normalized_column = self._data_source.normalize_identifier(node.name)
113
+ ansi_name = normalized_column.ansi_normalized
114
+ if ansi_name in user_transformations.keys():
115
+ return parse_one(
116
+ user_transformations.get(ansi_name, normalized_column.source_normalized), read=self.engine
117
+ )
100
118
  return node
101
119
 
102
120
  def _apply_default_transformation(
@@ -107,8 +125,7 @@ class QueryBuilder(ABC):
107
125
  with_transform.append(alias.transform(self._default_transformer, schema, source))
108
126
  return with_transform
109
127
 
110
- @staticmethod
111
- def _default_transformer(node: exp.Expression, schema: list[Schema], source: Dialect) -> exp.Expression:
128
+ def _default_transformer(self, node: exp.Expression, schema: list[Schema], source: Dialect) -> exp.Expression:
112
129
 
113
130
  def _get_transform(datatype: str):
114
131
  source_dialects = [source_key for source_key, dialect in SQLGLOT_DIALECTS.items() if dialect == source]
@@ -125,9 +142,10 @@ class QueryBuilder(ABC):
125
142
 
126
143
  schema_dict = {v.column_name: v.data_type for v in schema}
127
144
  if isinstance(node, exp.Column):
128
- column_name = node.name
129
- if column_name in schema_dict.keys():
130
- transform = _get_transform(schema_dict.get(column_name, column_name))
145
+ normalized_column = self._data_source.normalize_identifier(node.name)
146
+ ansi_name = normalized_column.ansi_normalized
147
+ if ansi_name in schema_dict.keys():
148
+ transform = _get_transform(schema_dict.get(ansi_name, normalized_column.source_normalized))
131
149
  return transform_expression(node, transform)
132
150
  return node
133
151
 
@@ -136,3 +154,20 @@ class QueryBuilder(ABC):
136
154
  message = f"Exception for {self.table_conf.target_name} target table in {self.layer} layer --> {message}"
137
155
  logger.error(message)
138
156
  raise InvalidInputException(message)
157
+
158
def _build_column_with_alias(self, column: str):
    """Build a quoted column expression for ``column`` together with its alias.

    The column itself is the ``source_normalized`` name of ``column``. The alias is the name returned by
    ``table_conf.get_layer_tgt_to_src_col_mapping`` for the current layer, passed through
    ``DialectUtils.unnormalize_identifier``.
    """
    source_name = self._build_column_name_source_normalized(column)
    mapped_name = self.table_conf.get_layer_tgt_to_src_col_mapping(column, self.layer)
    alias_name = DialectUtils.unnormalize_identifier(mapped_name)
    return build_column(this=source_name, alias=alias_name, quoted=True)
166
+
167
+ def _build_column_name_source_normalized(self, column: str):
168
+ return self._data_source.normalize_identifier(column).source_normalized
169
+
170
+ def _build_alias_source_normalized(self, column: str):
171
+ return self._data_source.normalize_identifier(
172
+ self.table_conf.get_layer_tgt_to_src_col_mapping(column, self.layer)
173
+ ).source_normalized
@@ -125,6 +125,7 @@ def anonymous(expr: exp.Column, func: str, is_expr: bool = False, dialect=None)
125
125
  return new_expr
126
126
 
127
127
 
128
+ # TODO Standardize impl and use quoted and Identifier/Column consistently
128
129
  def build_column(this: exp.ExpOrStr, table_name="", quoted=False, alias=None) -> exp.Expression:
129
130
  if alias:
130
131
  if isinstance(this, str):
@@ -135,6 +136,10 @@ def build_column(this: exp.ExpOrStr, table_name="", quoted=False, alias=None) ->
135
136
  return exp.Column(this=exp.Identifier(this=this, quoted=quoted), table=table_name)
136
137
 
137
138
 
139
def build_column_no_alias(this: str, table_name="") -> exp.Expression:
    """Build an ``exp.Column`` for ``this`` without an alias.

    ``this`` and ``table_name`` are handed to ``exp.Column`` exactly as given; no ``exp.Identifier``
    wrapper is created here.
    """
    column = exp.Column(this=this, table=table_name)
    return column
141
+
142
+
138
143
  def build_literal(this: exp.ExpOrStr, alias=None, quoted=False, is_string=True, cast=None) -> exp.Expression:
139
144
  base_literal = exp.Literal(this=this, is_string=is_string)
140
145
  if not cast and not alias:
@@ -207,10 +212,11 @@ def build_sub(
207
212
  right_column_name: str,
208
213
  left_table_name: str | None = None,
209
214
  right_table_name: str | None = None,
215
+ quoted: bool = False,
210
216
  ) -> exp.Sub:
211
217
  return exp.Sub(
212
- this=build_column(left_column_name, left_table_name),
213
- expression=build_column(right_column_name, right_table_name),
218
+ this=build_column(left_column_name, left_table_name, quoted=quoted),
219
+ expression=build_column(right_column_name, right_table_name, quoted=quoted),
214
220
  )
215
221
 
216
222
 
@@ -11,8 +11,8 @@ from databricks.labs.lakebridge.reconcile.query_builder.expression_generator imp
11
11
  get_hash_transform,
12
12
  lower,
13
13
  transform_expression,
14
+ build_column_no_alias,
14
15
  )
15
- from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import get_dialect
16
16
 
17
17
  logger = logging.getLogger(__name__)
18
18
 
@@ -41,15 +41,12 @@ class HashQueryBuilder(QueryBuilder):
41
41
 
42
42
  key_cols = hash_cols if report_type == "row" else sorted(_join_columns | self.partition_column)
43
43
 
44
- cols_with_alias = [
45
- build_column(this=col, alias=self.table_conf.get_layer_tgt_to_src_col_mapping(col, self.layer))
46
- for col in key_cols
47
- ]
44
+ cols_with_alias = [self._build_column_with_alias(col) for col in key_cols]
48
45
 
49
46
  # in case if we have column mapping, we need to sort the target columns in the order of source columns to get
50
47
  # same hash value
51
48
  hash_cols_with_alias = [
52
- {"this": col, "alias": self.table_conf.get_layer_tgt_to_src_col_mapping(col, self.layer)}
49
+ {"this": self._build_column_name_source_normalized(col), "alias": self._build_alias_source_normalized(col)}
53
50
  for col in hash_cols
54
51
  ]
55
52
  sorted_hash_cols_with_alias = sorted(hash_cols_with_alias, key=lambda column: column["alias"])
@@ -60,12 +57,11 @@ class HashQueryBuilder(QueryBuilder):
60
57
  )
61
58
  hash_col_with_transform = [self._generate_hash_algorithm(hashcols_sorted_as_src_seq, _HASH_COLUMN_NAME)]
62
59
 
63
- dialect = self.engine if self.layer == "source" else get_dialect("databricks")
64
60
  res = (
65
61
  exp.select(*hash_col_with_transform + key_cols_with_transform)
66
62
  .from_(":tbl")
67
- .where(self.filter)
68
- .sql(dialect=dialect)
63
+ .where(self.filter, dialect=self.engine)
64
+ .sql(dialect=self.engine)
69
65
  )
70
66
 
71
67
  logger.info(f"Hash Query for {self.layer}: {res}")
@@ -76,10 +72,8 @@ class HashQueryBuilder(QueryBuilder):
76
72
  cols: list[str],
77
73
  column_alias: str,
78
74
  ) -> exp.Expression:
79
- cols_with_alias = [build_column(this=col, alias=None) for col in cols]
80
- cols_with_transform = self.add_transformations(
81
- cols_with_alias, self.engine if self.layer == "source" else get_dialect("databricks")
82
- )
75
+ cols_no_alias = [build_column_no_alias(this=col) for col in cols]
76
+ cols_with_transform = self.add_transformations(cols_no_alias, self.engine)
83
77
  col_exprs = exp.select(*cols_with_transform).iter_expressions()
84
78
  concat_expr = concat(list(col_exprs))
85
79
 
@@ -4,6 +4,7 @@ import sqlglot.expressions as exp
4
4
  from pyspark.sql import DataFrame
5
5
  from sqlglot import select
6
6
 
7
+ from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils
7
8
  from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import get_key_from_dialect
8
9
  from databricks.labs.lakebridge.reconcile.query_builder.base import QueryBuilder
9
10
  from databricks.labs.lakebridge.reconcile.query_builder.expression_generator import (
@@ -37,12 +38,9 @@ class SamplingQueryBuilder(QueryBuilder):
37
38
 
38
39
  cols = sorted((join_columns | self.select_columns) - self.threshold_columns - self.drop_columns)
39
40
 
40
- cols_with_alias = [
41
- build_column(this=col, alias=self.table_conf.get_layer_tgt_to_src_col_mapping(col, self.layer))
42
- for col in cols
43
- ]
41
+ cols_with_alias = [self._build_column_with_alias(col) for col in cols]
44
42
 
45
- query = select(*cols_with_alias).from_(":tbl").where(self.filter).sql(dialect=self.engine)
43
+ query = select(*cols_with_alias).from_(":tbl").where(self.filter, dialect=self.engine).sql(dialect=self.engine)
46
44
 
47
45
  logger.info(f"Sampling Query with Alias for {self.layer}: {query}")
48
46
  return query
@@ -59,22 +57,22 @@ class SamplingQueryBuilder(QueryBuilder):
59
57
 
60
58
  cols = sorted((join_columns | self.select_columns) - self.threshold_columns - self.drop_columns)
61
59
 
62
- cols_with_alias = [
63
- build_column(this=col, alias=self.table_conf.get_layer_tgt_to_src_col_mapping(col, self.layer))
64
- for col in cols
65
- ]
60
+ cols_with_alias = [self._build_column_with_alias(col) for col in cols]
66
61
 
67
62
  sql_with_transforms = self.add_transformations(cols_with_alias, self.engine)
68
- query_sql = select(*sql_with_transforms).from_(":tbl").where(self.filter)
63
+ query_sql = select(*sql_with_transforms).from_(":tbl").where(self.filter, dialect=self.engine)
69
64
  if self.layer == "source":
70
- with_select = [build_column(this=col, table_name="src") for col in sorted(cols)]
65
+ with_select = [
66
+ build_column(this=DialectUtils.unnormalize_identifier(col), table_name="src", quoted=True)
67
+ for col in sorted(cols)
68
+ ]
71
69
  else:
72
70
  with_select = [
73
- build_column(this=col, table_name="src")
71
+ build_column(this=DialectUtils.unnormalize_identifier(col), table_name="src", quoted=True)
74
72
  for col in sorted(self.table_conf.get_tgt_to_src_col_mapping_list(cols))
75
73
  ]
76
74
 
77
- join_clause = SamplingQueryBuilder._get_join_clause(key_cols)
75
+ join_clause = self._get_join_clause(key_cols)
78
76
 
79
77
  query = (
80
78
  with_clause.with_(alias="src", as_=query_sql)
@@ -86,10 +84,10 @@ class SamplingQueryBuilder(QueryBuilder):
86
84
  logger.info(f"Sampling Query for {self.layer}: {query}")
87
85
  return query
88
86
 
89
- @classmethod
90
- def _get_join_clause(cls, key_cols: list):
87
+ def _get_join_clause(self, key_cols: list):
88
+ normalized = [self._build_column_name_source_normalized(col) for col in key_cols]
91
89
  return build_join_clause(
92
- "recon", key_cols, source_table_alias="src", target_table_alias="recon", kind="inner", func=exp.EQ
90
+ "recon", normalized, source_table_alias="src", target_table_alias="recon", kind="inner", func=exp.EQ
93
91
  )
94
92
 
95
93
  def _get_with_clause(self, df: DataFrame) -> exp.Select:
@@ -106,12 +104,13 @@ class SamplingQueryBuilder(QueryBuilder):
106
104
  (
107
105
  build_literal(
108
106
  this=str(value),
109
- alias=col,
107
+ alias=DialectUtils.unnormalize_identifier(col),
110
108
  is_string=_get_is_string(column_types_dict, col),
111
- cast=orig_types_dict.get(col),
109
+ cast=orig_types_dict.get(DialectUtils.ansi_normalize_identifier(col)),
110
+ quoted=True,
112
111
  )
113
112
  if value is not None
114
- else exp.Alias(this=exp.Null(), alias=col)
113
+ else exp.Alias(this=exp.Null(), alias=DialectUtils.unnormalize_identifier(col), quoted=True)
115
114
  )
116
115
  for col, value in zip(df.columns, row)
117
116
  ]
@@ -3,6 +3,7 @@ import logging
3
3
  from sqlglot import expressions as exp
4
4
  from sqlglot import select
5
5
 
6
+ from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils
6
7
  from databricks.labs.lakebridge.reconcile.query_builder.base import QueryBuilder
7
8
  from databricks.labs.lakebridge.reconcile.query_builder.expression_generator import (
8
9
  anonymous,
@@ -54,6 +55,7 @@ class ThresholdQueryBuilder(QueryBuilder):
54
55
  left_table_name="source",
55
56
  right_column_name=column,
56
57
  right_table_name="databricks",
58
+ quoted=False,
57
59
  )
58
60
  ).transform(coalesce)
59
61
 
@@ -62,7 +64,14 @@ class ThresholdQueryBuilder(QueryBuilder):
62
64
  where_clause.append(where)
63
65
  # join columns
64
66
  for column in sorted(join_columns):
65
- select_clause.append(build_column(this=column, alias=f"{column}_source", table_name="source"))
67
+ select_clause.append(
68
+ build_column(
69
+ this=column,
70
+ alias=f"{DialectUtils.unnormalize_identifier(column)}_source",
71
+ table_name="source",
72
+ quoted=True,
73
+ )
74
+ )
66
75
  where = build_where_clause(where_clause)
67
76
 
68
77
  return select_clause, where
@@ -76,10 +85,20 @@ class ThresholdQueryBuilder(QueryBuilder):
76
85
  select_clause = []
77
86
  column = threshold.column_name
78
87
  select_clause.append(
79
- build_column(this=column, alias=f"{column}_source", table_name="source").transform(coalesce)
88
+ build_column(
89
+ this=column,
90
+ alias=f"{DialectUtils.unnormalize_identifier(column)}_source",
91
+ table_name="source",
92
+ quoted=True,
93
+ ).transform(coalesce)
80
94
  )
81
95
  select_clause.append(
82
- build_column(this=column, alias=f"{column}_databricks", table_name="databricks").transform(coalesce)
96
+ build_column(
97
+ this=column,
98
+ alias=f"{DialectUtils.unnormalize_identifier(column)}_databricks",
99
+ table_name="databricks",
100
+ quoted=True,
101
+ ).transform(coalesce)
83
102
  )
84
103
  where_clause = exp.NEQ(this=base, expression=exp.Literal(this="0", is_string=False))
85
104
  return select_clause, where_clause
@@ -110,7 +129,13 @@ class ThresholdQueryBuilder(QueryBuilder):
110
129
  logger.error(error_message)
111
130
  raise ValueError(error_message)
112
131
 
113
- select_clause.append(build_column(this=func(base=base, threshold=threshold), alias=f"{column}_match"))
132
+ select_clause.append(
133
+ build_column(
134
+ this=func(base=base, threshold=threshold),
135
+ alias=f"{DialectUtils.unnormalize_identifier(column)}_match",
136
+ quoted=True,
137
+ )
138
+ )
114
139
 
115
140
  return select_clause, where_clause
116
141
 
@@ -170,8 +195,8 @@ class ThresholdQueryBuilder(QueryBuilder):
170
195
  ),
171
196
  expression=exp.Is(
172
197
  this=exp.Column(
173
- this=exp.Identifier(this=threshold.column_name, quoted=False),
174
- table=exp.Identifier(this='databricks'),
198
+ this=threshold.column_name,
199
+ table="databricks",
175
200
  ),
176
201
  expression=exp.Null(),
177
202
  ),
@@ -211,21 +236,17 @@ class ThresholdQueryBuilder(QueryBuilder):
211
236
  self._validate(self.join_columns, "Join Columns are compulsory for threshold query")
212
237
  join_columns = self.join_columns if self.join_columns else set()
213
238
  keys: list[str] = sorted(self.partition_column.union(join_columns))
214
- keys_select_alias = [
215
- build_column(this=col, alias=self.table_conf.get_layer_tgt_to_src_col_mapping(col, self.layer))
216
- for col in keys
217
- ]
239
+ keys_select_alias = [self._build_column_with_alias(col) for col in keys]
218
240
  keys_expr = self._apply_user_transformation(keys_select_alias)
219
241
 
220
242
  # threshold column expression
221
- threshold_alias = [
222
- build_column(this=col, alias=self.table_conf.get_layer_tgt_to_src_col_mapping(col, self.layer))
223
- for col in sorted(self.threshold_columns)
224
- ]
243
+ threshold_alias = [self._build_column_with_alias(col) for col in sorted(self.threshold_columns)]
225
244
  thresholds_expr = threshold_alias
226
245
  if self.user_transformations:
227
246
  thresholds_expr = self._apply_user_transformation(threshold_alias)
228
247
 
229
- query = (select(*keys_expr + thresholds_expr).from_(":tbl").where(self.filter)).sql(dialect=self.engine)
248
+ query = (select(*keys_expr + thresholds_expr).from_(":tbl").where(self.filter, dialect=self.engine)).sql(
249
+ dialect=self.engine
250
+ )
230
251
  logger.info(f"Threshold Query for {self.layer}: {query}")
231
252
  return query
@@ -257,21 +257,6 @@ class Table:
257
257
  return set()
258
258
  return {self.get_layer_src_to_tgt_col_mapping(col, layer) for col in self.drop_columns}
259
259
 
260
- def get_transformation_dict(self, layer: str) -> dict[str, str]:
261
- if self.transformations:
262
- if layer == "source":
263
- return {
264
- trans.column_name: (trans.source if trans.source else trans.column_name)
265
- for trans in self.transformations
266
- }
267
- return {
268
- self.get_layer_src_to_tgt_col_mapping(trans.column_name, layer): (
269
- trans.target if trans.target else self.get_layer_src_to_tgt_col_mapping(trans.column_name, layer)
270
- )
271
- for trans in self.transformations
272
- }
273
- return {}
274
-
275
260
  def get_partition_column(self, layer: str) -> set[str]:
276
261
  if self.jdbc_reader_options and layer == "source":
277
262
  if self.jdbc_reader_options.partition_column:
@@ -288,8 +273,11 @@ class Table:
288
273
 
289
274
  @dataclass
290
275
  class Schema:
276
+ # TODO remove: This will have the value of ansi_normalized_column_name. Kept for backwards compatibility.
291
277
  column_name: str
292
278
  data_type: str
279
+ ansi_normalized_column_name: str
280
+ source_normalized_column_name: str
293
281
 
294
282
 
295
283
  @dataclass
@@ -31,7 +31,8 @@ class DataReconcileOutput:
31
31
 
32
32
  @dataclass
33
33
  class SchemaMatchResult:
34
- source_column: str
34
+ source_column_normalized: str
35
+ source_column_normalized_ansi: str
35
36
  source_datatype: str
36
37
  databricks_column: str
37
38
  databricks_datatype: str