databricks-labs-lakebridge 0.10.6__py3-none-any.whl → 0.10.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- databricks/labs/lakebridge/__about__.py +1 -1
- databricks/labs/lakebridge/analyzer/__init__.py +0 -0
- databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py +95 -0
- databricks/labs/lakebridge/assessments/profiler_validator.py +103 -0
- databricks/labs/lakebridge/base_install.py +20 -3
- databricks/labs/lakebridge/cli.py +32 -59
- databricks/labs/lakebridge/contexts/application.py +7 -0
- databricks/labs/lakebridge/deployment/job.py +2 -2
- databricks/labs/lakebridge/helpers/file_utils.py +36 -0
- databricks/labs/lakebridge/helpers/validation.py +5 -3
- databricks/labs/lakebridge/install.py +73 -484
- databricks/labs/lakebridge/reconcile/compare.py +70 -33
- databricks/labs/lakebridge/reconcile/connectors/data_source.py +24 -1
- databricks/labs/lakebridge/reconcile/connectors/databricks.py +12 -1
- databricks/labs/lakebridge/reconcile/connectors/dialect_utils.py +126 -0
- databricks/labs/lakebridge/reconcile/connectors/models.py +7 -0
- databricks/labs/lakebridge/reconcile/connectors/oracle.py +12 -1
- databricks/labs/lakebridge/reconcile/connectors/secrets.py +19 -1
- databricks/labs/lakebridge/reconcile/connectors/snowflake.py +63 -30
- databricks/labs/lakebridge/reconcile/connectors/tsql.py +28 -2
- databricks/labs/lakebridge/reconcile/constants.py +4 -3
- databricks/labs/lakebridge/reconcile/execute.py +9 -810
- databricks/labs/lakebridge/reconcile/normalize_recon_config_service.py +133 -0
- databricks/labs/lakebridge/reconcile/query_builder/base.py +53 -18
- databricks/labs/lakebridge/reconcile/query_builder/expression_generator.py +8 -2
- databricks/labs/lakebridge/reconcile/query_builder/hash_query.py +7 -13
- databricks/labs/lakebridge/reconcile/query_builder/sampling_query.py +18 -19
- databricks/labs/lakebridge/reconcile/query_builder/threshold_query.py +36 -15
- databricks/labs/lakebridge/reconcile/recon_config.py +3 -15
- databricks/labs/lakebridge/reconcile/recon_output_config.py +2 -1
- databricks/labs/lakebridge/reconcile/reconciliation.py +511 -0
- databricks/labs/lakebridge/reconcile/schema_compare.py +26 -19
- databricks/labs/lakebridge/reconcile/trigger_recon_aggregate_service.py +78 -0
- databricks/labs/lakebridge/reconcile/trigger_recon_service.py +256 -0
- databricks/labs/lakebridge/reconcile/utils.py +38 -0
- databricks/labs/lakebridge/transpiler/execute.py +34 -28
- databricks/labs/lakebridge/transpiler/installers.py +523 -0
- databricks/labs/lakebridge/transpiler/lsp/lsp_engine.py +47 -60
- databricks/labs/lakebridge/transpiler/sqlglot/dialect_utils.py +2 -0
- databricks/labs/lakebridge/transpiler/transpile_engine.py +0 -18
- {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/METADATA +1 -1
- {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/RECORD +46 -35
- {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/WHEEL +0 -0
- {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/entry_points.txt +0 -0
- {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/licenses/LICENSE +0 -0
- {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/licenses/NOTICE +0 -0
databricks/labs/lakebridge/reconcile/reconciliation.py (new file)
@@ -0,0 +1,511 @@
+import logging
+
+from pyspark.sql import DataFrame, SparkSession
+from sqlglot import Dialect
+
+from databricks.labs.lakebridge.config import (
+    DatabaseConfig,
+    ReconcileMetadataConfig,
+)
+from databricks.labs.lakebridge.reconcile import utils
+from databricks.labs.lakebridge.reconcile.compare import (
+    capture_mismatch_data_and_columns,
+    reconcile_data,
+    join_aggregate_data,
+    reconcile_agg_data_per_rule,
+)
+from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource
+from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils
+from databricks.labs.lakebridge.reconcile.exception import (
+    DataSourceRuntimeException,
+)
+from databricks.labs.lakebridge.reconcile.query_builder.aggregate_query import AggregateQueryBuilder
+from databricks.labs.lakebridge.reconcile.query_builder.count_query import CountQueryBuilder
+from databricks.labs.lakebridge.reconcile.query_builder.hash_query import HashQueryBuilder
+from databricks.labs.lakebridge.reconcile.query_builder.sampling_query import (
+    SamplingQueryBuilder,
+)
+from databricks.labs.lakebridge.reconcile.query_builder.threshold_query import (
+    ThresholdQueryBuilder,
+)
+from databricks.labs.lakebridge.reconcile.recon_config import (
+    Schema,
+    Table,
+    AggregateQueryRules,
+    SamplingOptions,
+)
+from databricks.labs.lakebridge.reconcile.recon_output_config import (
+    DataReconcileOutput,
+    ThresholdOutput,
+    ReconcileRecordCount,
+    AggregateQueryOutput,
+)
+from databricks.labs.lakebridge.reconcile.sampler import SamplerFactory
+from databricks.labs.lakebridge.reconcile.schema_compare import SchemaCompare
+from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import get_dialect
+
+logger = logging.getLogger(__name__)
+_SAMPLE_ROWS = 50
+
+
+class Reconciliation:
+
+    def __init__(
+        self,
+        source: DataSource,
+        target: DataSource,
+        database_config: DatabaseConfig,
+        report_type: str,
+        schema_comparator: SchemaCompare,
+        source_engine: Dialect,
+        spark: SparkSession,
+        metadata_config: ReconcileMetadataConfig,
+    ):
+        self._source = source
+        self._target = target
+        self._report_type = report_type
+        self._database_config = database_config
+        self._schema_comparator = schema_comparator
+        self._target_engine = get_dialect("databricks")
+        self._source_engine = source_engine
+        self._spark = spark
+        self._metadata_config = metadata_config
+
+    @property
+    def source(self) -> DataSource:
+        return self._source
+
+    @property
+    def target(self) -> DataSource:
+        return self._target
+
+    @property
+    def report_type(self) -> str:
+        return self._report_type
+
+    def reconcile_data(
+        self,
+        table_conf: Table,
+        src_schema: list[Schema],
+        tgt_schema: list[Schema],
+    ) -> DataReconcileOutput:
+        data_reconcile_output = self._get_reconcile_output(table_conf, src_schema, tgt_schema)
+        reconcile_output = data_reconcile_output
+        if self._report_type in {"data", "all"}:
+            reconcile_output = self._get_sample_data(table_conf, data_reconcile_output, src_schema, tgt_schema)
+            if table_conf.get_threshold_columns("source"):
+                reconcile_output.threshold_output = self._reconcile_threshold_data(table_conf, src_schema, tgt_schema)
+
+        if self._report_type == "row" and table_conf.get_threshold_columns("source"):
+            logger.warning("Threshold comparison is ignored for 'row' report type")
+
+        return reconcile_output
+
+    def reconcile_schema(
+        self,
+        src_schema: list[Schema],
+        tgt_schema: list[Schema],
+        table_conf: Table,
+    ):
+        return self._schema_comparator.compare(src_schema, tgt_schema, self._source_engine, table_conf)
+
+    def reconcile_aggregates(
+        self,
+        table_conf: Table,
+        src_schema: list[Schema],
+        tgt_schema: list[Schema],
+    ) -> list[AggregateQueryOutput]:
+        return self._get_reconcile_aggregate_output(table_conf, src_schema, tgt_schema)
+
+    def _get_reconcile_output(
+        self,
+        table_conf,
+        src_schema,
+        tgt_schema,
+    ):
+        src_hash_query = HashQueryBuilder(
+            table_conf, src_schema, "source", self._source_engine, self._source
+        ).build_query(report_type=self._report_type)
+        tgt_hash_query = HashQueryBuilder(
+            table_conf, tgt_schema, "target", self._source_engine, self._target
+        ).build_query(report_type=self._report_type)
+        src_data = self._source.read_data(
+            catalog=self._database_config.source_catalog,
+            schema=self._database_config.source_schema,
+            table=table_conf.source_name,
+            query=src_hash_query,
+            options=table_conf.jdbc_reader_options,
+        )
+        tgt_data = self._target.read_data(
+            catalog=self._database_config.target_catalog,
+            schema=self._database_config.target_schema,
+            table=table_conf.target_name,
+            query=tgt_hash_query,
+            options=table_conf.jdbc_reader_options,
+        )
+
+        volume_path = utils.generate_volume_path(table_conf, self._metadata_config)
+        return reconcile_data(
+            source=src_data,
+            target=tgt_data,
+            key_columns=table_conf.join_columns,
+            report_type=self._report_type,
+            spark=self._spark,
+            path=volume_path,
+        )
+
+    def _get_reconcile_aggregate_output(
+        self,
+        table_conf,
+        src_schema,
+        tgt_schema,
+    ):
+        """
+        Creates a single Query, for the aggregates having the same group by columns. (Ex: 1)
+        If there are no group by columns, all the aggregates are clubbed together in a single query. (Ex: 2)
+        Examples:
+            1. {
+                  "type": "MIN",
+                  "agg_cols": ["COL1"],
+                  "group_by_cols": ["COL4"]
+               },
+               {
+                  "type": "MAX",
+                  "agg_cols": ["COL2"],
+                  "group_by_cols": ["COL9"]
+               },
+               {
+                  "type": "COUNT",
+                  "agg_cols": ["COL2"],
+                  "group_by_cols": ["COL9"]
+               },
+               {
+                  "type": "AVG",
+                  "agg_cols": ["COL3"],
+                  "group_by_cols": ["COL4"]
+               },
+            Query 1: SELECT MIN(COL1), AVG(COL3) FROM :table GROUP BY COL4
+            Rules: ID | Aggregate Type | Column | Group By Column
+                       #1, MIN, COL1, COL4
+                       #2, AVG, COL3, COL4
+            -------------------------------------------------------
+            Query 2: SELECT MAX(COL2), COUNT(COL2) FROM :table GROUP BY COL9
+            Rules: ID | Aggregate Type | Column | Group By Column
+                       #1, MAX, COL2, COL9
+                       #2, COUNT, COL2, COL9
+            2. {
+                  "type": "MAX",
+                  "agg_cols": ["COL1"]
+               },
+               {
+                  "type": "SUM",
+                  "agg_cols": ["COL2"]
+               },
+               {
+                  "type": "MAX",
+                  "agg_cols": ["COL3"]
+               }
+            Query: SELECT MAX(COL1), SUM(COL2), MAX(COL3) FROM :table
+            Rules: ID | Aggregate Type | Column | Group By Column
+                       #1, MAX, COL1,
+                       #2, SUM, COL2,
+                       #3, MAX, COL3,
+        """
+
+        src_query_builder = AggregateQueryBuilder(
+            table_conf,
+            src_schema,
+            "source",
+            self._source_engine,
+            self._source,
+        )
+
+        # build Aggregate queries for source,
+        src_agg_queries: list[AggregateQueryRules] = src_query_builder.build_queries()
+
+        # There could be one or more queries per table based on the group by columns
+
+        # build Aggregate queries for target(Databricks),
+        tgt_agg_queries: list[AggregateQueryRules] = AggregateQueryBuilder(
+            table_conf,
+            tgt_schema,
+            "target",
+            self._target_engine,
+            self._target,
+        ).build_queries()
+
+        volume_path = utils.generate_volume_path(table_conf, self._metadata_config)
+
+        table_agg_output: list[AggregateQueryOutput] = []
+
+        # Iterate over the grouped aggregates and reconcile the data
+        # Zip all the keys, read the source, target data for each Aggregate query
+        # and reconcile on the aggregate data
+        # For e.g., (source_query_GRP1, target_query_GRP1), (source_query_GRP2, target_query_GRP2)
+        for src_query_with_rules, tgt_query_with_rules in zip(src_agg_queries, tgt_agg_queries):
+            # For each Aggregate query, read the Source and Target Data and add a hash column
+
+            rules_reconcile_output: list[AggregateQueryOutput] = []
+            src_data = None
+            tgt_data = None
+            joined_df = None
+            data_source_exception = None
+            try:
+                src_data = self._source.read_data(
+                    catalog=self._database_config.source_catalog,
+                    schema=self._database_config.source_schema,
+                    table=table_conf.source_name,
+                    query=src_query_with_rules.query,
+                    options=table_conf.jdbc_reader_options,
+                )
+                tgt_data = self._target.read_data(
+                    catalog=self._database_config.target_catalog,
+                    schema=self._database_config.target_schema,
+                    table=table_conf.target_name,
+                    query=tgt_query_with_rules.query,
+                    options=table_conf.jdbc_reader_options,
+                )
+                # Join the Source and Target Aggregated data
+                joined_df = join_aggregate_data(
+                    source=src_data,
+                    target=tgt_data,
+                    key_columns=src_query_with_rules.group_by_columns,
+                    spark=self._spark,
+                    path=f"{volume_path}{src_query_with_rules.group_by_columns_as_str}",
+                )
+            except DataSourceRuntimeException as e:
+                data_source_exception = e
+
+            # For each Aggregated Query, reconcile the data based on the rule
+            for rule in src_query_with_rules.rules:
+                if data_source_exception:
+                    rule_reconcile_output = DataReconcileOutput(exception=str(data_source_exception))
+                else:
+                    rule_reconcile_output = reconcile_agg_data_per_rule(
+                        joined_df, src_data.columns, tgt_data.columns, rule
+                    )
+                rules_reconcile_output.append(AggregateQueryOutput(rule=rule, reconcile_output=rule_reconcile_output))
+
+            # For each table, there could be many Aggregated queries.
+            # Collect the list of Rule Reconcile output per each Aggregate query and append it to the list
+            table_agg_output.extend(rules_reconcile_output)
+        return table_agg_output
+
+    def _get_sample_data(
+        self,
+        table_conf,
+        reconcile_output,
+        src_schema,
+        tgt_schema,
+    ):
+        mismatch = None
+        missing_in_src = None
+        missing_in_tgt = None
+
+        if (
+            reconcile_output.mismatch_count > 0
+            or reconcile_output.missing_in_src_count > 0
+            or reconcile_output.missing_in_tgt_count > 0
+        ):
+            src_sampler = SamplingQueryBuilder(table_conf, src_schema, "source", self._source_engine, self._source)
+            tgt_sampler = SamplingQueryBuilder(table_conf, tgt_schema, "target", self._target_engine, self._target)
+            if reconcile_output.mismatch_count > 0:
+                mismatch = self._get_mismatch_data(
+                    src_sampler,
+                    tgt_sampler,
+                    reconcile_output.mismatch_count,
+                    reconcile_output.mismatch.mismatch_df,
+                    table_conf.join_columns,
+                    table_conf.source_name,
+                    table_conf.target_name,
+                    table_conf.sampling_options,
+                )
+
+            if reconcile_output.missing_in_src_count > 0:
+                missing_in_src = Reconciliation._get_missing_data(
+                    self._target,
+                    tgt_sampler,
+                    reconcile_output.missing_in_src,
+                    self._database_config.target_catalog,
+                    self._database_config.target_schema,
+                    table_conf.target_name,
+                )
+
+            if reconcile_output.missing_in_tgt_count > 0:
+                missing_in_tgt = Reconciliation._get_missing_data(
+                    self._source,
+                    src_sampler,
+                    reconcile_output.missing_in_tgt,
+                    self._database_config.source_catalog,
+                    self._database_config.source_schema,
+                    table_conf.source_name,
+                )
+
+        return DataReconcileOutput(
+            mismatch=mismatch,
+            mismatch_count=reconcile_output.mismatch_count,
+            missing_in_src_count=reconcile_output.missing_in_src_count,
+            missing_in_tgt_count=reconcile_output.missing_in_tgt_count,
+            missing_in_src=missing_in_src,
+            missing_in_tgt=missing_in_tgt,
+        )
+
+    def _get_mismatch_data(
+        self,
+        src_sampler,
+        tgt_sampler,
+        mismatch_count,
+        mismatch,
+        key_columns,
+        src_table: str,
+        tgt_table: str,
+        sampling_options: SamplingOptions,
+    ):
+
+        tgt_sampling_query = tgt_sampler.build_query_with_alias()
+
+        sampling_model_target = self._target.read_data(
+            catalog=self._database_config.target_catalog,
+            schema=self._database_config.target_schema,
+            table=tgt_table,
+            query=tgt_sampling_query,
+            options=None,
+        )
+
+        # Uses pre-calculated `mismatch_count` from `reconcile_output.mismatch_count` to avoid from recomputing `mismatch` for RandomSampler.
+        mismatch_sampler = SamplerFactory.get_sampler(sampling_options)
+        df = mismatch_sampler.sample(mismatch, mismatch_count, key_columns, sampling_model_target).cache()
+
+        src_mismatch_sample_query = src_sampler.build_query(df)
+        tgt_mismatch_sample_query = tgt_sampler.build_query(df)
+
+        src_data = self._source.read_data(
+            catalog=self._database_config.source_catalog,
+            schema=self._database_config.source_schema,
+            table=src_table,
+            query=src_mismatch_sample_query,
+            options=None,
+        )
+        tgt_data = self._target.read_data(
+            catalog=self._database_config.target_catalog,
+            schema=self._database_config.target_schema,
+            table=tgt_table,
+            query=tgt_mismatch_sample_query,
+            options=None,
+        )
+
+        return capture_mismatch_data_and_columns(source=src_data, target=tgt_data, key_columns=key_columns)
+
+    def _reconcile_threshold_data(
+        self,
+        table_conf: Table,
+        src_schema: list[Schema],
+        tgt_schema: list[Schema],
+    ):
+
+        src_data, tgt_data = self._get_threshold_data(table_conf, src_schema, tgt_schema)
+
+        source_view = f"source_{table_conf.source_name}_df_threshold_vw"
+        target_view = f"target_{table_conf.target_name}_df_threshold_vw"
+
+        src_data.createOrReplaceTempView(source_view)
+        tgt_data.createOrReplaceTempView(target_view)
+
+        return self._compute_threshold_comparison(table_conf, src_schema)
+
+    def _get_threshold_data(
+        self,
+        table_conf: Table,
+        src_schema: list[Schema],
+        tgt_schema: list[Schema],
+    ) -> tuple[DataFrame, DataFrame]:
+        src_threshold_query = ThresholdQueryBuilder(
+            table_conf, src_schema, "source", self._source_engine, self._source
+        ).build_threshold_query()
+        tgt_threshold_query = ThresholdQueryBuilder(
+            table_conf, tgt_schema, "target", self._target_engine, self._target
+        ).build_threshold_query()
+
+        src_data = self._source.read_data(
+            catalog=self._database_config.source_catalog,
+            schema=self._database_config.source_schema,
+            table=table_conf.source_name,
+            query=src_threshold_query,
+            options=table_conf.jdbc_reader_options,
+        )
+        tgt_data = self._target.read_data(
+            catalog=self._database_config.target_catalog,
+            schema=self._database_config.target_schema,
+            table=table_conf.target_name,
+            query=tgt_threshold_query,
+            options=table_conf.jdbc_reader_options,
+        )
+
+        return src_data, tgt_data
+
+    def _compute_threshold_comparison(self, table_conf: Table, src_schema: list[Schema]) -> ThresholdOutput:
+        threshold_comparison_query = ThresholdQueryBuilder(
+            table_conf, src_schema, "target", self._target_engine, self._target
+        ).build_comparison_query()
+
+        threshold_result = self._target.read_data(
+            catalog=self._database_config.target_catalog,
+            schema=self._database_config.target_schema,
+            table=table_conf.target_name,
+            query=threshold_comparison_query,
+            options=table_conf.jdbc_reader_options,
+        )
+        threshold_columns = table_conf.get_threshold_columns("source")
+        failed_where_cond = " OR ".join(
+            ["`" + DialectUtils.unnormalize_identifier(name) + "_match` = 'Failed'" for name in threshold_columns]
+        )
+        mismatched_df = threshold_result.filter(failed_where_cond)
+        mismatched_count = mismatched_df.count()
+        threshold_df = None
+        if mismatched_count > 0:
+            threshold_df = mismatched_df.limit(_SAMPLE_ROWS)
+
+        return ThresholdOutput(threshold_df=threshold_df, threshold_mismatch_count=mismatched_count)
+
+    def get_record_count(self, table_conf: Table, report_type: str) -> ReconcileRecordCount:
+        if report_type != "schema":
+            source_count_query = CountQueryBuilder(table_conf, "source", self._source_engine).build_query()
+            target_count_query = CountQueryBuilder(table_conf, "target", self._target_engine).build_query()
+            source_count_row = self._source.read_data(
+                catalog=self._database_config.source_catalog,
+                schema=self._database_config.source_schema,
+                table=table_conf.source_name,
+                query=source_count_query,
+                options=None,
+            ).first()
+            target_count_row = self._target.read_data(
+                catalog=self._database_config.target_catalog,
+                schema=self._database_config.target_schema,
+                table=table_conf.target_name,
+                query=target_count_query,
+                options=None,
+            ).first()
+
+            source_count = int(source_count_row[0]) if source_count_row is not None else 0
+            target_count = int(target_count_row[0]) if target_count_row is not None else 0
+
+            return ReconcileRecordCount(source=int(source_count), target=int(target_count))
+        return ReconcileRecordCount()
+
+    @staticmethod
+    def _get_missing_data(
+        reader: DataSource,
+        sampler: SamplingQueryBuilder,
+        missing_df: DataFrame,
+        catalog: str,
+        schema: str,
+        table_name: str,
+    ) -> DataFrame:
+        sample_query = sampler.build_query(missing_df)
+        return reader.read_data(
+            catalog=catalog,
+            schema=schema,
+            table=table_name,
+            query=sample_query,
+            options=None,
+        )
databricks/labs/lakebridge/reconcile/schema_compare.py
@@ -1,10 +1,10 @@
 import logging
-from dataclasses import asdict
 
 from pyspark.sql import DataFrame, SparkSession
 from pyspark.sql.types import BooleanType, StringType, StructField, StructType
 from sqlglot import Dialect, parse_one
 
+from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils
 from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import get_dialect
 from databricks.labs.lakebridge.reconcile.recon_config import Schema, Table
 from databricks.labs.lakebridge.reconcile.recon_output_config import SchemaMatchResult, SchemaReconcileOutput
@@ -20,8 +20,7 @@ class SchemaCompare:
     ):
         self.spark = spark
 
-
-    _schema_compare_schema: StructType = StructType(
+    _schema_compare_output_schema: StructType = StructType(
         [
             StructField("source_column", StringType(), False),
             StructField("source_datatype", StringType(), False),
@@ -47,14 +46,16 @@ class SchemaCompare:
         target_column_map = table_conf.to_src_col_map or {}
         master_schema_match_res = [
             SchemaMatchResult(
-
-
+                source_column_normalized=s.source_normalized_column_name,
+                source_column_normalized_ansi=s.ansi_normalized_column_name,
                 source_datatype=s.data_type,
+                databricks_column=target_column_map.get(s.ansi_normalized_column_name, s.ansi_normalized_column_name),
                 databricks_datatype=next(
                     (
                         tgt.data_type
                         for tgt in databricks_schema
-                        if tgt.
+                        if tgt.ansi_normalized_column_name
+                        == target_column_map.get(s.ansi_normalized_column_name, s.ansi_normalized_column_name)
                     ),
                     "",
                 ),
@@ -63,16 +64,22 @@ class SchemaCompare:
         ]
         return master_schema_match_res
 
-    def
-        """
-
-
-
-
-
-
+    def _create_output_dataframe(self, data: list[SchemaMatchResult], schema: StructType) -> DataFrame:
+        """Return a user-friendly dataframe for schema compare result."""
+        transformed = []
+        for item in data:
+            output = tuple(
+                [
+                    DialectUtils.unnormalize_identifier(item.source_column_normalized_ansi),
+                    item.source_datatype,
+                    DialectUtils.unnormalize_identifier(item.databricks_column),
+                    item.databricks_datatype,
+                    item.is_valid,
+                ]
+            )
+            transformed.append(output)
 
-        return
+        return self.spark.createDataFrame(transformed, schema)
 
     @classmethod
     def _parse(cls, source: Dialect, column: str, data_type: str) -> str:
@@ -88,10 +95,10 @@ class SchemaCompare:
 
     @classmethod
     def _validate_parsed_query(cls, master: SchemaMatchResult, parsed_query) -> None:
-        databricks_query = f"create table dummy ({master.
+        databricks_query = f"create table dummy ({master.source_column_normalized_ansi} {master.databricks_datatype})"
         logger.info(
             f"""
-            Source datatype: create table dummy ({master.
+            Source datatype: create table dummy ({master.source_column_normalized} {master.source_datatype})
             Parse datatype: {parsed_query}
            Databricks datatype: {databricks_query}
            """
@@ -116,11 +123,11 @@ class SchemaCompare:
         master_schema = self._build_master_schema(source_schema, databricks_schema, table_conf)
         for master in master_schema:
             if not isinstance(source, Databricks):
-                parsed_query = self._parse(source, master.
+                parsed_query = self._parse(source, master.source_column_normalized, master.source_datatype)
                 self._validate_parsed_query(master, parsed_query)
             elif master.source_datatype.lower() != master.databricks_datatype.lower():
                 master.is_valid = False
 
-        df = self.
+        df = self._create_output_dataframe(master_schema, self._schema_compare_output_schema)
         final_result = self._table_schema_status(master_schema)
         return SchemaReconcileOutput(final_result, df)
databricks/labs/lakebridge/reconcile/trigger_recon_aggregate_service.py (new file)
@@ -0,0 +1,78 @@
+from datetime import datetime
+
+from pyspark.sql import SparkSession
+from databricks.sdk import WorkspaceClient
+
+from databricks.labs.lakebridge.config import ReconcileConfig, TableRecon
+from databricks.labs.lakebridge.reconcile import utils
+from databricks.labs.lakebridge.reconcile.exception import DataSourceRuntimeException, ReconciliationException
+from databricks.labs.lakebridge.reconcile.recon_capture import (
+    ReconIntermediatePersist,
+    generate_final_reconcile_aggregate_output,
+)
+from databricks.labs.lakebridge.reconcile.recon_config import AGG_RECONCILE_OPERATION_NAME
+from databricks.labs.lakebridge.reconcile.recon_output_config import (
+    ReconcileProcessDuration,
+    AggregateQueryOutput,
+    DataReconcileOutput,
+)
+from databricks.labs.lakebridge.reconcile.trigger_recon_service import TriggerReconService
+
+
+class TriggerReconAggregateService:
+    @staticmethod
+    def trigger_recon_aggregates(
+        ws: WorkspaceClient,
+        spark: SparkSession,
+        table_recon: TableRecon,
+        reconcile_config: ReconcileConfig,
+        local_test_run: bool = False,
+    ):
+        reconciler, recon_capture = TriggerReconService.create_recon_dependencies(
+            ws, spark, reconcile_config, local_test_run
+        )
+
+        # Get the Aggregated Reconciliation Output for each table
+        for table_conf in table_recon.tables:
+            recon_process_duration = ReconcileProcessDuration(start_ts=str(datetime.now()), end_ts=None)
+            try:
+                src_schema, tgt_schema = TriggerReconService.get_schemas(
+                    reconciler.source, reconciler.target, table_conf, reconcile_config.database_config, False
+                )
+            except DataSourceRuntimeException as e:
+                raise ReconciliationException(message=str(e)) from e
+
+            assert table_conf.aggregates, "Aggregates must be defined for Aggregates Reconciliation"
+
+            try:
+                table_reconcile_agg_output_list = reconciler.reconcile_aggregates(table_conf, src_schema, tgt_schema)
+            except DataSourceRuntimeException as e:
+                table_reconcile_agg_output_list = [
+                    AggregateQueryOutput(reconcile_output=DataReconcileOutput(exception=str(e)), rule=None)
+                ]
+
+            recon_process_duration.end_ts = str(datetime.now())
+
+            # Persist the data to the delta tables
+            recon_capture.store_aggregates_metrics(
+                reconcile_agg_output_list=table_reconcile_agg_output_list,
+                table_conf=table_conf,
+                recon_process_duration=recon_process_duration,
+            )
+
+            (
+                ReconIntermediatePersist(
+                    spark=spark,
+                    path=utils.generate_volume_path(table_conf, reconcile_config.metadata_config),
+                ).clean_unmatched_df_from_volume()
+            )
+
+        return TriggerReconService.verify_successful_reconciliation(
+            generate_final_reconcile_aggregate_output(
+                recon_id=recon_capture.recon_id,
+                spark=spark,
+                metadata_config=reconcile_config.metadata_config,
+                local_test_run=local_test_run,
+            ),
+            operation_name=AGG_RECONCILE_OPERATION_NAME,
+        )