databricks-labs-lakebridge 0.10.6__py3-none-any.whl → 0.10.8__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- databricks/labs/lakebridge/__about__.py +1 -1
- databricks/labs/lakebridge/analyzer/__init__.py +0 -0
- databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py +95 -0
- databricks/labs/lakebridge/assessments/profiler_validator.py +103 -0
- databricks/labs/lakebridge/base_install.py +20 -3
- databricks/labs/lakebridge/cli.py +32 -59
- databricks/labs/lakebridge/contexts/application.py +7 -0
- databricks/labs/lakebridge/deployment/job.py +2 -2
- databricks/labs/lakebridge/helpers/file_utils.py +36 -0
- databricks/labs/lakebridge/helpers/validation.py +5 -3
- databricks/labs/lakebridge/install.py +73 -484
- databricks/labs/lakebridge/reconcile/compare.py +70 -33
- databricks/labs/lakebridge/reconcile/connectors/data_source.py +24 -1
- databricks/labs/lakebridge/reconcile/connectors/databricks.py +12 -1
- databricks/labs/lakebridge/reconcile/connectors/dialect_utils.py +126 -0
- databricks/labs/lakebridge/reconcile/connectors/models.py +7 -0
- databricks/labs/lakebridge/reconcile/connectors/oracle.py +12 -1
- databricks/labs/lakebridge/reconcile/connectors/secrets.py +19 -1
- databricks/labs/lakebridge/reconcile/connectors/snowflake.py +63 -30
- databricks/labs/lakebridge/reconcile/connectors/tsql.py +28 -2
- databricks/labs/lakebridge/reconcile/constants.py +4 -3
- databricks/labs/lakebridge/reconcile/execute.py +9 -810
- databricks/labs/lakebridge/reconcile/normalize_recon_config_service.py +133 -0
- databricks/labs/lakebridge/reconcile/query_builder/base.py +53 -18
- databricks/labs/lakebridge/reconcile/query_builder/expression_generator.py +8 -2
- databricks/labs/lakebridge/reconcile/query_builder/hash_query.py +7 -13
- databricks/labs/lakebridge/reconcile/query_builder/sampling_query.py +18 -19
- databricks/labs/lakebridge/reconcile/query_builder/threshold_query.py +36 -15
- databricks/labs/lakebridge/reconcile/recon_config.py +3 -15
- databricks/labs/lakebridge/reconcile/recon_output_config.py +2 -1
- databricks/labs/lakebridge/reconcile/reconciliation.py +511 -0
- databricks/labs/lakebridge/reconcile/schema_compare.py +26 -19
- databricks/labs/lakebridge/reconcile/trigger_recon_aggregate_service.py +78 -0
- databricks/labs/lakebridge/reconcile/trigger_recon_service.py +256 -0
- databricks/labs/lakebridge/reconcile/utils.py +38 -0
- databricks/labs/lakebridge/transpiler/execute.py +34 -28
- databricks/labs/lakebridge/transpiler/installers.py +523 -0
- databricks/labs/lakebridge/transpiler/lsp/lsp_engine.py +47 -60
- databricks/labs/lakebridge/transpiler/sqlglot/dialect_utils.py +2 -0
- databricks/labs/lakebridge/transpiler/transpile_engine.py +0 -18
- {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/METADATA +1 -1
- {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/RECORD +46 -35
- {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/WHEEL +0 -0
- {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/entry_points.txt +0 -0
- {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/licenses/LICENSE +0 -0
- {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/licenses/NOTICE +0 -0
databricks/labs/lakebridge/reconcile/normalize_recon_config_service.py (new file):

@@ -0,0 +1,133 @@
+import dataclasses
+
+from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource
+from databricks.labs.lakebridge.reconcile.recon_config import (
+    Table,
+    Aggregate,
+    ColumnMapping,
+    Transformation,
+    ColumnThresholds,
+)
+
+
+class NormalizeReconConfigService:
+    def __init__(self, source: DataSource, target: DataSource):
+        self.source = source
+        self.target = target
+
+    def normalize_recon_table_config(self, table: Table) -> Table:
+        normalized_table = dataclasses.replace(table)
+
+        self._normalize_sampling(normalized_table)
+        self._normalize_aggs(normalized_table)
+        self._normalize_join_cols(normalized_table)
+        self._normalize_select_cols(normalized_table)
+        self._normalize_drop_cols(normalized_table)
+        self._normalize_col_mappings(normalized_table)
+        self._normalize_transformations(normalized_table)
+        self._normalize_col_thresholds(normalized_table)
+        self._normalize_jdbc_options(normalized_table)
+
+        return normalized_table
+
+    def _normalize_sampling(self, table: Table):
+        if table.sampling_options:
+            normalized_sampling = dataclasses.replace(table.sampling_options)
+            normalized_sampling.stratified_columns = (
+                [self.source.normalize_identifier(c).ansi_normalized for c in normalized_sampling.stratified_columns]
+                if normalized_sampling.stratified_columns
+                else None
+            )
+            table.sampling_options = normalized_sampling
+        return table
+
+    def _normalize_aggs(self, table: Table):
+        normalized = [self._normalize_agg(a) for a in table.aggregates] if table.aggregates else None
+        table.aggregates = normalized
+        return table
+
+    def _normalize_agg(self, agg: Aggregate) -> Aggregate:
+        normalized = dataclasses.replace(agg)
+        normalized.agg_columns = [self.source.normalize_identifier(c).ansi_normalized for c in normalized.agg_columns]
+        normalized.group_by_columns = (
+            [self.source.normalize_identifier(c).ansi_normalized for c in normalized.group_by_columns]
+            if normalized.group_by_columns
+            else None
+        )
+        return normalized
+
+    def _normalize_join_cols(self, table: Table):
+        table.join_columns = (
+            [self.source.normalize_identifier(c).ansi_normalized for c in table.join_columns]
+            if table.join_columns
+            else None
+        )
+        return table
+
+    def _normalize_select_cols(self, table: Table):
+        table.select_columns = (
+            [self.source.normalize_identifier(c).ansi_normalized for c in table.select_columns]
+            if table.select_columns
+            else None
+        )
+        return table
+
+    def _normalize_drop_cols(self, table: Table):
+        table.drop_columns = (
+            [self.source.normalize_identifier(c).ansi_normalized for c in table.drop_columns]
+            if table.drop_columns
+            else None
+        )
+        return table
+
+    def _normalize_col_mappings(self, table: Table):
+        table.column_mapping = (
+            [self._normalize_col_mapping(m) for m in table.column_mapping] if table.column_mapping else None
+        )
+        return table
+
+    def _normalize_col_mapping(self, mapping: ColumnMapping):
+        return ColumnMapping(
+            source_name=self.source.normalize_identifier(mapping.source_name).ansi_normalized,
+            target_name=self.target.normalize_identifier(mapping.target_name).ansi_normalized,
+        )
+
+    def _normalize_transformations(self, table: Table):
+        table.transformations = (
+            [self._normalize_transformation(t) for t in table.transformations] if table.transformations else None
+        )
+        return table
+
+    def _normalize_transformation(self, transform: Transformation):
+        """normalize user-configured transformations
+
+        The user configures the table column and passes SQL code to transform the source table and target table.
+        This is useful in scenarios when the data changes e.g. migrating `datetime`s. The SQL code is not normalized
+        and it is the user responsibility to pass valid SQL respecting source database and target database.
+        """
+        normalized = dataclasses.replace(transform)
+        normalized.column_name = self.source.normalize_identifier(transform.column_name).ansi_normalized
+        return normalized
+
+    def _normalize_col_thresholds(self, table: Table):
+        table.column_thresholds = (
+            [self._normalize_col_threshold(t) for t in table.column_thresholds] if table.column_thresholds else None
+        )
+        return table
+
+    def _normalize_col_threshold(self, threshold: ColumnThresholds):
+        normalized = dataclasses.replace(threshold)
+        normalized.column_name = self.source.normalize_identifier(threshold.column_name).ansi_normalized
+        return normalized
+
+    def _normalize_jdbc_options(self, table: Table):
+        if table.jdbc_reader_options:
+            normalized = dataclasses.replace(table.jdbc_reader_options)
+            normalized.partition_column = (
+                self.source.normalize_identifier(normalized.partition_column).ansi_normalized
+                if normalized.partition_column
+                else None
+            )
+            table.jdbc_reader_options = normalized
+
+        return table
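The new service copies the user's reconciliation config and rewrites every identifier into its ANSI-normalized form before any query is built. A minimal sketch of the copy-then-mutate pattern it relies on, with a hypothetical stand-in for `recon_config.Table` (note that `dataclasses.replace` makes a shallow copy, which is why nested configs such as `sampling_options` are themselves replaced above before being mutated):

```python
import dataclasses


@dataclasses.dataclass
class TableConfig:  # hypothetical stand-in for recon_config.Table
    join_columns: list[str] | None = None


def normalize(cfg: TableConfig, normalize_identifier) -> TableConfig:
    # Shallow-copy first, then rewrite fields on the copy so the caller's
    # original config object is left untouched.
    out = dataclasses.replace(cfg)
    out.join_columns = (
        [normalize_identifier(c) for c in out.join_columns] if out.join_columns else None
    )
    return out


cfg = TableConfig(join_columns=["Order_ID", "REGION"])
print(normalize(cfg, str.lower).join_columns)  # ['order_id', 'region']
print(cfg.join_columns)  # ['Order_ID', 'REGION'] -- original unchanged
```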
databricks/labs/lakebridge/reconcile/query_builder/base.py:

@@ -4,10 +4,13 @@ from abc import ABC
 import sqlglot.expressions as exp
 from sqlglot import Dialect, parse_one
 
+from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource
+from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils
 from databricks.labs.lakebridge.reconcile.exception import InvalidInputException
 from databricks.labs.lakebridge.reconcile.query_builder.expression_generator import (
     DataType_transform_mapping,
     transform_expression,
+    build_column,
 )
 from databricks.labs.lakebridge.reconcile.recon_config import Schema, Table, Aggregate
 from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import get_dialect, SQLGLOT_DIALECTS
@@ -16,21 +19,16 @@ logger = logging.getLogger(__name__)
 
 
 class QueryBuilder(ABC):
-    def __init__(
-        self,
-        table_conf: Table,
-        schema: list[Schema],
-        layer: str,
-        engine: Dialect,
-    ):
+    def __init__(self, table_conf: Table, schema: list[Schema], layer: str, engine: Dialect, data_source: DataSource):
         self._table_conf = table_conf
         self._schema = schema
         self._layer = layer
         self._engine = engine
+        self._data_source = data_source
 
     @property
     def engine(self) -> Dialect:
-        return self._engine
+        return self._engine if self.layer == "source" else get_dialect("databricks")
 
     @property
     def layer(self) -> str:
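The reworked `engine` property hard-wires the dialect per layer: source-side queries render in the source system's dialect, while anything built for the target always renders as Databricks SQL. A toy sqlglot example (not lakebridge code) of what that switch changes in the emitted text:

```python
import sqlglot

# One parsed query, two renderings; identifier quoting follows the dialect.
query = sqlglot.select('"Order ID"').from_("orders")
print(query.sql(dialect="snowflake"))   # SELECT "Order ID" FROM orders
print(query.sql(dialect="databricks"))  # SELECT `Order ID` FROM orders
```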
@@ -70,7 +68,25 @@ class QueryBuilder(ABC):
 
     @property
     def user_transformations(self) -> dict[str, str]:
-
+        if self._table_conf.transformations:
+            if self.layer == "source":
+                return {
+                    trans.column_name: (
+                        trans.source
+                        if trans.source
+                        else self._data_source.normalize_identifier(trans.column_name).source_normalized
+                    )
+                    for trans in self._table_conf.transformations
+                }
+            return {
+                self._table_conf.get_layer_src_to_tgt_col_mapping(trans.column_name, self.layer): (
+                    trans.target
+                    if trans.target
+                    else self._table_conf.get_layer_src_to_tgt_col_mapping(trans.column_name, self.layer)
+                )
+                for trans in self._table_conf.transformations
+            }
+        return {}
 
     @property
     def aggregates(self) -> list[Aggregate] | None:
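This property absorbs the logic of `Table.get_transformation_dict` (deleted from `recon_config.py` further down), with one behavioral change: on the source layer, a transformation with no explicit SQL now falls back to the source-normalized identifier instead of the bare column name. A small illustration of the dict it produces, using a stand-in `Transformation` record and `str.upper` standing in for the data source's normalizer:

```python
from dataclasses import dataclass


@dataclass
class Transformation:  # stand-in mirroring recon_config.Transformation
    column_name: str
    source: str | None = None
    target: str | None = None


transformations = [
    Transformation("amount", source="CAST(amount AS DECIMAL(38, 2))"),
    Transformation("region"),  # no SQL given -> fall back to the normalized identifier
]

source_map = {
    t.column_name: t.source if t.source else t.column_name.upper()  # .upper() mocks normalization
    for t in transformations
}
print(source_map)  # {'amount': 'CAST(amount AS DECIMAL(38, 2))', 'region': 'REGION'}
```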
@@ -93,10 +109,12 @@ class QueryBuilder(ABC):
 
     def _user_transformer(self, node: exp.Expression, user_transformations: dict[str, str]) -> exp.Expression:
         if isinstance(node, exp.Column) and user_transformations:
-
-
-            if
-            return parse_one(
+            normalized_column = self._data_source.normalize_identifier(node.name)
+            ansi_name = normalized_column.ansi_normalized
+            if ansi_name in user_transformations.keys():
+                return parse_one(
+                    user_transformations.get(ansi_name, normalized_column.source_normalized), read=self.engine
+                )
         return node
 
     def _apply_default_transformation(
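`_user_transformer` runs inside sqlglot's `transform` walk: each column whose ANSI-normalized name has a user transformation is swapped for the parsed transformation expression, and sqlglot does not revisit the replaced subtree. A self-contained sketch of that node-replacement mechanic (identifier normalization omitted):

```python
import sqlglot.expressions as exp
from sqlglot import parse_one

user_transformations = {"amount": "CAST(amount AS DECIMAL(38, 2))"}


def user_transformer(node: exp.Expression) -> exp.Expression:
    # Swap matching column nodes for the parsed user-supplied SQL.
    if isinstance(node, exp.Column) and node.name in user_transformations:
        return parse_one(user_transformations[node.name])
    return node


query = parse_one("SELECT amount, region FROM orders").transform(user_transformer)
print(query.sql())  # SELECT CAST(amount AS DECIMAL(38, 2)), region FROM orders
```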
@@ -107,8 +125,7 @@ class QueryBuilder(ABC):
         with_transform.append(alias.transform(self._default_transformer, schema, source))
         return with_transform
 
-    @staticmethod
-    def _default_transformer(node: exp.Expression, schema: list[Schema], source: Dialect) -> exp.Expression:
+    def _default_transformer(self, node: exp.Expression, schema: list[Schema], source: Dialect) -> exp.Expression:
 
         def _get_transform(datatype: str):
             source_dialects = [source_key for source_key, dialect in SQLGLOT_DIALECTS.items() if dialect == source]
@@ -125,9 +142,10 @@ class QueryBuilder(ABC):
 
         schema_dict = {v.column_name: v.data_type for v in schema}
         if isinstance(node, exp.Column):
-
-
-
+            normalized_column = self._data_source.normalize_identifier(node.name)
+            ansi_name = normalized_column.ansi_normalized
+            if ansi_name in schema_dict.keys():
+                transform = _get_transform(schema_dict.get(ansi_name, normalized_column.source_normalized))
             return transform_expression(node, transform)
         return node
 
@@ -136,3 +154,20 @@ class QueryBuilder(ABC):
         message = f"Exception for {self.table_conf.target_name} target table in {self.layer} layer --> {message}"
         logger.error(message)
         raise InvalidInputException(message)
+
+    def _build_column_with_alias(self, column: str):
+        return build_column(
+            this=self._build_column_name_source_normalized(column),
+            alias=DialectUtils.unnormalize_identifier(
+                self.table_conf.get_layer_tgt_to_src_col_mapping(column, self.layer)
+            ),
+            quoted=True,
+        )
+
+    def _build_column_name_source_normalized(self, column: str):
+        return self._data_source.normalize_identifier(column).source_normalized
+
+    def _build_alias_source_normalized(self, column: str):
+        return self._data_source.normalize_identifier(
+            self.table_conf.get_layer_tgt_to_src_col_mapping(column, self.layer)
+        ).source_normalized
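These three helpers centralize the pattern used throughout the builders below: select the column in its source-normalized (natively quoted) form, but alias it to the unquoted ANSI name so result-set columns line up across source and target. A rough sqlglot analogue of the aliasing, using `exp.alias_` directly rather than lakebridge's `build_column`:

```python
import sqlglot.expressions as exp

# Natively quoted column, aliased back to a plain name; quoting follows the dialect.
aliased = exp.alias_(exp.column("order id", quoted=True), "order id", quoted=True)
print(aliased.sql(dialect="snowflake"))   # "order id" AS "order id"
print(aliased.sql(dialect="databricks"))  # `order id` AS `order id`
```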
databricks/labs/lakebridge/reconcile/query_builder/expression_generator.py:

@@ -125,6 +125,7 @@ def anonymous(expr: exp.Column, func: str, is_expr: bool = False, dialect=None)
     return new_expr
 
 
+# TODO Standardize impl and use quoted and Identifier/Column consistently
 def build_column(this: exp.ExpOrStr, table_name="", quoted=False, alias=None) -> exp.Expression:
     if alias:
         if isinstance(this, str):
@@ -135,6 +136,10 @@ def build_column(this: exp.ExpOrStr, table_name="", quoted=False, alias=None) -> exp.Expression:
     return exp.Column(this=exp.Identifier(this=this, quoted=quoted), table=table_name)
 
 
+def build_column_no_alias(this: str, table_name="") -> exp.Expression:
+    return exp.Column(this=this, table=table_name)
+
+
 def build_literal(this: exp.ExpOrStr, alias=None, quoted=False, is_string=True, cast=None) -> exp.Expression:
     base_literal = exp.Literal(this=this, is_string=is_string)
     if not cast and not alias:
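Unlike `build_column`, which wraps the string in an `exp.Identifier` (and may re-quote it), `build_column_no_alias` passes the string through as-is; callers such as the hash builder use it for identifiers that are already source-normalized. A short sqlglot sketch of the difference:

```python
import sqlglot.expressions as exp

wrapped = exp.Column(this=exp.Identifier(this="Order ID", quoted=True))
print(wrapped.sql(dialect="databricks"))  # `Order ID` -- identifier re-quoted by the dialect

passthrough = exp.Column(this='"Order ID"')
print(passthrough.sql(dialect="databricks"))  # "Order ID" -- string emitted verbatim
```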
@@ -207,10 +212,11 @@ def build_sub(
     right_column_name: str,
     left_table_name: str | None = None,
     right_table_name: str | None = None,
+    quoted: bool = False,
 ) -> exp.Sub:
     return exp.Sub(
-        this=build_column(left_column_name, left_table_name),
-        expression=build_column(right_column_name, right_table_name),
+        this=build_column(left_column_name, left_table_name, quoted=quoted),
+        expression=build_column(right_column_name, right_table_name, quoted=quoted),
     )
 
 
databricks/labs/lakebridge/reconcile/query_builder/hash_query.py:

@@ -11,8 +11,8 @@ from databricks.labs.lakebridge.reconcile.query_builder.expression_generator import (
     get_hash_transform,
     lower,
     transform_expression,
+    build_column_no_alias,
 )
-from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import get_dialect
 
 logger = logging.getLogger(__name__)
 
@@ -41,15 +41,12 @@ class HashQueryBuilder(QueryBuilder):
 
         key_cols = hash_cols if report_type == "row" else sorted(_join_columns | self.partition_column)
 
-        cols_with_alias = [
-            build_column(this=col, alias=self.table_conf.get_layer_tgt_to_src_col_mapping(col, self.layer))
-            for col in key_cols
-        ]
+        cols_with_alias = [self._build_column_with_alias(col) for col in key_cols]
 
         # in case if we have column mapping, we need to sort the target columns in the order of source columns to get
         # same hash value
         hash_cols_with_alias = [
-            {"this": col, "alias": self.
+            {"this": self._build_column_name_source_normalized(col), "alias": self._build_alias_source_normalized(col)}
             for col in hash_cols
         ]
         sorted_hash_cols_with_alias = sorted(hash_cols_with_alias, key=lambda column: column["alias"])
@@ -60,12 +57,11 @@ class HashQueryBuilder(QueryBuilder):
         )
         hash_col_with_transform = [self._generate_hash_algorithm(hashcols_sorted_as_src_seq, _HASH_COLUMN_NAME)]
 
-        dialect = self.engine if self.layer == "source" else get_dialect("databricks")
         res = (
             exp.select(*hash_col_with_transform + key_cols_with_transform)
             .from_(":tbl")
-            .where(self.filter)
-            .sql(dialect=
+            .where(self.filter, dialect=self.engine)
+            .sql(dialect=self.engine)
         )
 
         logger.info(f"Hash Query for {self.layer}: {res}")
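A change repeated across all the builders in this release: the user's filter is now parsed with `.where(self.filter, dialect=self.engine)` instead of sqlglot's default dialect, and the layer-dependent dialect choice moved into the `engine` property shown earlier. A standalone example of why the parse dialect matters when the filter uses source-dialect syntax:

```python
import sqlglot

# A T-SQL filter with bracket-quoted identifiers only parses under the tsql dialect:
query = sqlglot.select("*").from_("orders").where("[Order ID] > 10", dialect="tsql")
print(query.sql(dialect="tsql"))        # SELECT * FROM orders WHERE [Order ID] > 10
print(query.sql(dialect="databricks"))  # SELECT * FROM orders WHERE `Order ID` > 10
```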
@@ -76,10 +72,8 @@ class HashQueryBuilder(QueryBuilder):
         cols: list[str],
         column_alias: str,
     ) -> exp.Expression:
-
-        cols_with_transform = self.add_transformations(
-            cols_with_alias, self.engine if self.layer == "source" else get_dialect("databricks")
-        )
+        cols_no_alias = [build_column_no_alias(this=col) for col in cols]
+        cols_with_transform = self.add_transformations(cols_no_alias, self.engine)
         col_exprs = exp.select(*cols_with_transform).iter_expressions()
         concat_expr = concat(list(col_exprs))
 
databricks/labs/lakebridge/reconcile/query_builder/sampling_query.py:

@@ -4,6 +4,7 @@ import sqlglot.expressions as exp
 from pyspark.sql import DataFrame
 from sqlglot import select
 
+from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils
 from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import get_key_from_dialect
 from databricks.labs.lakebridge.reconcile.query_builder.base import QueryBuilder
 from databricks.labs.lakebridge.reconcile.query_builder.expression_generator import (
@@ -37,12 +38,9 @@ class SamplingQueryBuilder(QueryBuilder):
 
         cols = sorted((join_columns | self.select_columns) - self.threshold_columns - self.drop_columns)
 
-        cols_with_alias = [
-            build_column(this=col, alias=self.table_conf.get_layer_tgt_to_src_col_mapping(col, self.layer))
-            for col in cols
-        ]
+        cols_with_alias = [self._build_column_with_alias(col) for col in cols]
 
-        query = select(*cols_with_alias).from_(":tbl").where(self.filter).sql(dialect=self.engine)
+        query = select(*cols_with_alias).from_(":tbl").where(self.filter, dialect=self.engine).sql(dialect=self.engine)
 
         logger.info(f"Sampling Query with Alias for {self.layer}: {query}")
         return query
@@ -59,22 +57,22 @@ class SamplingQueryBuilder(QueryBuilder):
 
         cols = sorted((join_columns | self.select_columns) - self.threshold_columns - self.drop_columns)
 
-        cols_with_alias = [
-            build_column(this=col, alias=self.table_conf.get_layer_tgt_to_src_col_mapping(col, self.layer))
-            for col in cols
-        ]
+        cols_with_alias = [self._build_column_with_alias(col) for col in cols]
 
         sql_with_transforms = self.add_transformations(cols_with_alias, self.engine)
-        query_sql = select(*sql_with_transforms).from_(":tbl").where(self.filter)
+        query_sql = select(*sql_with_transforms).from_(":tbl").where(self.filter, dialect=self.engine)
         if self.layer == "source":
-            with_select = [
+            with_select = [
+                build_column(this=DialectUtils.unnormalize_identifier(col), table_name="src", quoted=True)
+                for col in sorted(cols)
+            ]
         else:
             with_select = [
-                build_column(this=col, table_name="src")
+                build_column(this=DialectUtils.unnormalize_identifier(col), table_name="src", quoted=True)
                 for col in sorted(self.table_conf.get_tgt_to_src_col_mapping_list(cols))
             ]
 
-        join_clause =
+        join_clause = self._get_join_clause(key_cols)
 
         query = (
             with_clause.with_(alias="src", as_=query_sql)
@@ -86,10 +84,10 @@ class SamplingQueryBuilder(QueryBuilder):
         logger.info(f"Sampling Query for {self.layer}: {query}")
         return query
 
-
-
+    def _get_join_clause(self, key_cols: list):
+        normalized = [self._build_column_name_source_normalized(col) for col in key_cols]
         return build_join_clause(
-            "recon",
+            "recon", normalized, source_table_alias="src", target_table_alias="recon", kind="inner", func=exp.EQ
         )
 
     def _get_with_clause(self, df: DataFrame) -> exp.Select:
@@ -106,12 +104,13 @@ class SamplingQueryBuilder(QueryBuilder):
             (
                 build_literal(
                     this=str(value),
-                    alias=col,
+                    alias=DialectUtils.unnormalize_identifier(col),
                     is_string=_get_is_string(column_types_dict, col),
-                    cast=orig_types_dict.get(col),
+                    cast=orig_types_dict.get(DialectUtils.ansi_normalize_identifier(col)),
+                    quoted=True,
                 )
                 if value is not None
-                else exp.Alias(this=exp.Null(), alias=col)
+                else exp.Alias(this=exp.Null(), alias=DialectUtils.unnormalize_identifier(col), quoted=True)
             )
             for col, value in zip(df.columns, row)
         ]
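`DialectUtils` lives in the new `reconcile/connectors/dialect_utils.py` (+126 lines, not shown in this diff). Judging only from the call sites above, `ansi_normalize_identifier` yields the ANSI double-quoted form used as the canonical key, and `unnormalize_identifier` strips quoting to recover the bare name for aliases. A speculative mock of that contract, purely to make the call sites readable; the real implementation may well differ:

```python
class DialectUtils:  # hypothetical mock, inferred from usage; not the lakebridge implementation
    @staticmethod
    def ansi_normalize_identifier(identifier: str) -> str:
        # Canonical ANSI form: wrap the bare name in double quotes.
        return identifier if identifier.startswith('"') else f'"{identifier}"'

    @staticmethod
    def unnormalize_identifier(identifier: str) -> str:
        # Bare name: strip ANSI double quotes if present.
        return identifier.strip('"')


print(DialectUtils.ansi_normalize_identifier("order id"))  # "order id"
print(DialectUtils.unnormalize_identifier('"order id"'))   # order id
```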
databricks/labs/lakebridge/reconcile/query_builder/threshold_query.py:

@@ -3,6 +3,7 @@ import logging
 from sqlglot import expressions as exp
 from sqlglot import select
 
+from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils
 from databricks.labs.lakebridge.reconcile.query_builder.base import QueryBuilder
 from databricks.labs.lakebridge.reconcile.query_builder.expression_generator import (
     anonymous,
@@ -54,6 +55,7 @@ class ThresholdQueryBuilder(QueryBuilder):
                 left_table_name="source",
                 right_column_name=column,
                 right_table_name="databricks",
+                quoted=False,
             )
         ).transform(coalesce)
 
@@ -62,7 +64,14 @@ class ThresholdQueryBuilder(QueryBuilder):
             where_clause.append(where)
         # join columns
         for column in sorted(join_columns):
-            select_clause.append(
+            select_clause.append(
+                build_column(
+                    this=column,
+                    alias=f"{DialectUtils.unnormalize_identifier(column)}_source",
+                    table_name="source",
+                    quoted=True,
+                )
+            )
         where = build_where_clause(where_clause)
 
         return select_clause, where
@@ -76,10 +85,20 @@ class ThresholdQueryBuilder(QueryBuilder):
         select_clause = []
         column = threshold.column_name
         select_clause.append(
-            build_column(
+            build_column(
+                this=column,
+                alias=f"{DialectUtils.unnormalize_identifier(column)}_source",
+                table_name="source",
+                quoted=True,
+            ).transform(coalesce)
         )
         select_clause.append(
-            build_column(
+            build_column(
+                this=column,
+                alias=f"{DialectUtils.unnormalize_identifier(column)}_databricks",
+                table_name="databricks",
+                quoted=True,
+            ).transform(coalesce)
         )
         where_clause = exp.NEQ(this=base, expression=exp.Literal(this="0", is_string=False))
         return select_clause, where_clause
@@ -110,7 +129,13 @@ class ThresholdQueryBuilder(QueryBuilder):
             logger.error(error_message)
             raise ValueError(error_message)
 
-        select_clause.append(
+        select_clause.append(
+            build_column(
+                this=func(base=base, threshold=threshold),
+                alias=f"{DialectUtils.unnormalize_identifier(column)}_match",
+                quoted=True,
+            )
+        )
 
         return select_clause, where_clause
 
@@ -170,8 +195,8 @@ class ThresholdQueryBuilder(QueryBuilder):
             ),
             expression=exp.Is(
                 this=exp.Column(
-                    this=
-                    table=
+                    this=threshold.column_name,
+                    table="databricks",
                 ),
                 expression=exp.Null(),
             ),
@@ -211,21 +236,17 @@ class ThresholdQueryBuilder(QueryBuilder):
         self._validate(self.join_columns, "Join Columns are compulsory for threshold query")
         join_columns = self.join_columns if self.join_columns else set()
         keys: list[str] = sorted(self.partition_column.union(join_columns))
-        keys_select_alias = [
-            build_column(this=col, alias=self.table_conf.get_layer_tgt_to_src_col_mapping(col, self.layer))
-            for col in keys
-        ]
+        keys_select_alias = [self._build_column_with_alias(col) for col in keys]
         keys_expr = self._apply_user_transformation(keys_select_alias)
 
         # threshold column expression
-        threshold_alias = [
-            build_column(this=col, alias=self.table_conf.get_layer_tgt_to_src_col_mapping(col, self.layer))
-            for col in sorted(self.threshold_columns)
-        ]
+        threshold_alias = [self._build_column_with_alias(col) for col in sorted(self.threshold_columns)]
         thresholds_expr = threshold_alias
         if self.user_transformations:
            thresholds_expr = self._apply_user_transformation(threshold_alias)
 
-        query = (select(*keys_expr + thresholds_expr).from_(":tbl").where(self.filter
+        query = (select(*keys_expr + thresholds_expr).from_(":tbl").where(self.filter, dialect=self.engine)).sql(
+            dialect=self.engine
+        )
         logger.info(f"Threshold Query for {self.layer}: {query}")
         return query
databricks/labs/lakebridge/reconcile/recon_config.py:

@@ -257,21 +257,6 @@ class Table:
             return set()
         return {self.get_layer_src_to_tgt_col_mapping(col, layer) for col in self.drop_columns}
 
-    def get_transformation_dict(self, layer: str) -> dict[str, str]:
-        if self.transformations:
-            if layer == "source":
-                return {
-                    trans.column_name: (trans.source if trans.source else trans.column_name)
-                    for trans in self.transformations
-                }
-            return {
-                self.get_layer_src_to_tgt_col_mapping(trans.column_name, layer): (
-                    trans.target if trans.target else self.get_layer_src_to_tgt_col_mapping(trans.column_name, layer)
-                )
-                for trans in self.transformations
-            }
-        return {}
-
     def get_partition_column(self, layer: str) -> set[str]:
         if self.jdbc_reader_options and layer == "source":
             if self.jdbc_reader_options.partition_column:
@@ -288,8 +273,11 @@ class Table:
 
 @dataclass
 class Schema:
+    # TODO remove: This will have the value of ansi_normalized_column_name. Kept for backwards compatibility.
     column_name: str
     data_type: str
+    ansi_normalized_column_name: str
+    source_normalized_column_name: str
 
 
 @dataclass