databricks-labs-lakebridge 0.10.6__py3-none-any.whl → 0.10.8__py3-none-any.whl

This diff compares the contents of two publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registry.
Files changed (46)
  1. databricks/labs/lakebridge/__about__.py +1 -1
  2. databricks/labs/lakebridge/analyzer/__init__.py +0 -0
  3. databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py +95 -0
  4. databricks/labs/lakebridge/assessments/profiler_validator.py +103 -0
  5. databricks/labs/lakebridge/base_install.py +20 -3
  6. databricks/labs/lakebridge/cli.py +32 -59
  7. databricks/labs/lakebridge/contexts/application.py +7 -0
  8. databricks/labs/lakebridge/deployment/job.py +2 -2
  9. databricks/labs/lakebridge/helpers/file_utils.py +36 -0
  10. databricks/labs/lakebridge/helpers/validation.py +5 -3
  11. databricks/labs/lakebridge/install.py +73 -484
  12. databricks/labs/lakebridge/reconcile/compare.py +70 -33
  13. databricks/labs/lakebridge/reconcile/connectors/data_source.py +24 -1
  14. databricks/labs/lakebridge/reconcile/connectors/databricks.py +12 -1
  15. databricks/labs/lakebridge/reconcile/connectors/dialect_utils.py +126 -0
  16. databricks/labs/lakebridge/reconcile/connectors/models.py +7 -0
  17. databricks/labs/lakebridge/reconcile/connectors/oracle.py +12 -1
  18. databricks/labs/lakebridge/reconcile/connectors/secrets.py +19 -1
  19. databricks/labs/lakebridge/reconcile/connectors/snowflake.py +63 -30
  20. databricks/labs/lakebridge/reconcile/connectors/tsql.py +28 -2
  21. databricks/labs/lakebridge/reconcile/constants.py +4 -3
  22. databricks/labs/lakebridge/reconcile/execute.py +9 -810
  23. databricks/labs/lakebridge/reconcile/normalize_recon_config_service.py +133 -0
  24. databricks/labs/lakebridge/reconcile/query_builder/base.py +53 -18
  25. databricks/labs/lakebridge/reconcile/query_builder/expression_generator.py +8 -2
  26. databricks/labs/lakebridge/reconcile/query_builder/hash_query.py +7 -13
  27. databricks/labs/lakebridge/reconcile/query_builder/sampling_query.py +18 -19
  28. databricks/labs/lakebridge/reconcile/query_builder/threshold_query.py +36 -15
  29. databricks/labs/lakebridge/reconcile/recon_config.py +3 -15
  30. databricks/labs/lakebridge/reconcile/recon_output_config.py +2 -1
  31. databricks/labs/lakebridge/reconcile/reconciliation.py +511 -0
  32. databricks/labs/lakebridge/reconcile/schema_compare.py +26 -19
  33. databricks/labs/lakebridge/reconcile/trigger_recon_aggregate_service.py +78 -0
  34. databricks/labs/lakebridge/reconcile/trigger_recon_service.py +256 -0
  35. databricks/labs/lakebridge/reconcile/utils.py +38 -0
  36. databricks/labs/lakebridge/transpiler/execute.py +34 -28
  37. databricks/labs/lakebridge/transpiler/installers.py +523 -0
  38. databricks/labs/lakebridge/transpiler/lsp/lsp_engine.py +47 -60
  39. databricks/labs/lakebridge/transpiler/sqlglot/dialect_utils.py +2 -0
  40. databricks/labs/lakebridge/transpiler/transpile_engine.py +0 -18
  41. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/METADATA +1 -1
  42. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/RECORD +46 -35
  43. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/WHEEL +0 -0
  44. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/entry_points.txt +0 -0
  45. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/licenses/LICENSE +0 -0
  46. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/licenses/NOTICE +0 -0
databricks/labs/lakebridge/reconcile/reconciliation.py
@@ -0,0 +1,511 @@
+ import logging
+
+ from pyspark.sql import DataFrame, SparkSession
+ from sqlglot import Dialect
+
+ from databricks.labs.lakebridge.config import (
+     DatabaseConfig,
+     ReconcileMetadataConfig,
+ )
+ from databricks.labs.lakebridge.reconcile import utils
+ from databricks.labs.lakebridge.reconcile.compare import (
+     capture_mismatch_data_and_columns,
+     reconcile_data,
+     join_aggregate_data,
+     reconcile_agg_data_per_rule,
+ )
+ from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource
+ from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils
+ from databricks.labs.lakebridge.reconcile.exception import (
+     DataSourceRuntimeException,
+ )
+ from databricks.labs.lakebridge.reconcile.query_builder.aggregate_query import AggregateQueryBuilder
+ from databricks.labs.lakebridge.reconcile.query_builder.count_query import CountQueryBuilder
+ from databricks.labs.lakebridge.reconcile.query_builder.hash_query import HashQueryBuilder
+ from databricks.labs.lakebridge.reconcile.query_builder.sampling_query import (
+     SamplingQueryBuilder,
+ )
+ from databricks.labs.lakebridge.reconcile.query_builder.threshold_query import (
+     ThresholdQueryBuilder,
+ )
+ from databricks.labs.lakebridge.reconcile.recon_config import (
+     Schema,
+     Table,
+     AggregateQueryRules,
+     SamplingOptions,
+ )
+ from databricks.labs.lakebridge.reconcile.recon_output_config import (
+     DataReconcileOutput,
+     ThresholdOutput,
+     ReconcileRecordCount,
+     AggregateQueryOutput,
+ )
+ from databricks.labs.lakebridge.reconcile.sampler import SamplerFactory
+ from databricks.labs.lakebridge.reconcile.schema_compare import SchemaCompare
+ from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import get_dialect
+
+ logger = logging.getLogger(__name__)
+ _SAMPLE_ROWS = 50
+
+
+ class Reconciliation:
+
+     def __init__(
+         self,
+         source: DataSource,
+         target: DataSource,
+         database_config: DatabaseConfig,
+         report_type: str,
+         schema_comparator: SchemaCompare,
+         source_engine: Dialect,
+         spark: SparkSession,
+         metadata_config: ReconcileMetadataConfig,
+     ):
+         self._source = source
+         self._target = target
+         self._report_type = report_type
+         self._database_config = database_config
+         self._schema_comparator = schema_comparator
+         self._target_engine = get_dialect("databricks")
+         self._source_engine = source_engine
+         self._spark = spark
+         self._metadata_config = metadata_config
+
+     @property
+     def source(self) -> DataSource:
+         return self._source
+
+     @property
+     def target(self) -> DataSource:
+         return self._target
+
+     @property
+     def report_type(self) -> str:
+         return self._report_type
+
+     def reconcile_data(
+         self,
+         table_conf: Table,
+         src_schema: list[Schema],
+         tgt_schema: list[Schema],
+     ) -> DataReconcileOutput:
+         data_reconcile_output = self._get_reconcile_output(table_conf, src_schema, tgt_schema)
+         reconcile_output = data_reconcile_output
+         if self._report_type in {"data", "all"}:
+             reconcile_output = self._get_sample_data(table_conf, data_reconcile_output, src_schema, tgt_schema)
+             if table_conf.get_threshold_columns("source"):
+                 reconcile_output.threshold_output = self._reconcile_threshold_data(table_conf, src_schema, tgt_schema)
+
+         if self._report_type == "row" and table_conf.get_threshold_columns("source"):
+             logger.warning("Threshold comparison is ignored for 'row' report type")
+
+         return reconcile_output
+
+     def reconcile_schema(
+         self,
+         src_schema: list[Schema],
+         tgt_schema: list[Schema],
+         table_conf: Table,
+     ):
+         return self._schema_comparator.compare(src_schema, tgt_schema, self._source_engine, table_conf)
+
+     def reconcile_aggregates(
+         self,
+         table_conf: Table,
+         src_schema: list[Schema],
+         tgt_schema: list[Schema],
+     ) -> list[AggregateQueryOutput]:
+         return self._get_reconcile_aggregate_output(table_conf, src_schema, tgt_schema)
+
+     def _get_reconcile_output(
+         self,
+         table_conf,
+         src_schema,
+         tgt_schema,
+     ):
+         src_hash_query = HashQueryBuilder(
+             table_conf, src_schema, "source", self._source_engine, self._source
+         ).build_query(report_type=self._report_type)
+         tgt_hash_query = HashQueryBuilder(
+             table_conf, tgt_schema, "target", self._source_engine, self._target
+         ).build_query(report_type=self._report_type)
+         src_data = self._source.read_data(
+             catalog=self._database_config.source_catalog,
+             schema=self._database_config.source_schema,
+             table=table_conf.source_name,
+             query=src_hash_query,
+             options=table_conf.jdbc_reader_options,
+         )
+         tgt_data = self._target.read_data(
+             catalog=self._database_config.target_catalog,
+             schema=self._database_config.target_schema,
+             table=table_conf.target_name,
+             query=tgt_hash_query,
+             options=table_conf.jdbc_reader_options,
+         )
+
+         volume_path = utils.generate_volume_path(table_conf, self._metadata_config)
+         return reconcile_data(
+             source=src_data,
+             target=tgt_data,
+             key_columns=table_conf.join_columns,
+             report_type=self._report_type,
+             spark=self._spark,
+             path=volume_path,
+         )
+
+     def _get_reconcile_aggregate_output(
+         self,
+         table_conf,
+         src_schema,
+         tgt_schema,
+     ):
+         """
+         Creates a single Query, for the aggregates having the same group by columns. (Ex: 1)
+         If there are no group by columns, all the aggregates are clubbed together in a single query. (Ex: 2)
+         Examples:
+         1. {
+                "type": "MIN",
+                "agg_cols": ["COL1"],
+                "group_by_cols": ["COL4"]
+            },
+            {
+                "type": "MAX",
+                "agg_cols": ["COL2"],
+                "group_by_cols": ["COL9"]
+            },
+            {
+                "type": "COUNT",
+                "agg_cols": ["COL2"],
+                "group_by_cols": ["COL9"]
+            },
+            {
+                "type": "AVG",
+                "agg_cols": ["COL3"],
+                "group_by_cols": ["COL4"]
+            },
+            Query 1: SELECT MIN(COL1), AVG(COL3) FROM :table GROUP BY COL4
+            Rules: ID | Aggregate Type | Column | Group By Column
+                   #1, MIN, COL1, COL4
+                   #2, AVG, COL3, COL4
+            -------------------------------------------------------
+            Query 2: SELECT MAX(COL2), COUNT(COL2) FROM :table GROUP BY COL9
+            Rules: ID | Aggregate Type | Column | Group By Column
+                   #1, MAX, COL2, COL9
+                   #2, COUNT, COL2, COL9
+         2. {
+                "type": "MAX",
+                "agg_cols": ["COL1"]
+            },
+            {
+                "type": "SUM",
+                "agg_cols": ["COL2"]
+            },
+            {
+                "type": "MAX",
+                "agg_cols": ["COL3"]
+            }
+            Query: SELECT MAX(COL1), SUM(COL2), MAX(COL3) FROM :table
+            Rules: ID | Aggregate Type | Column | Group By Column
+                   #1, MAX, COL1,
+                   #2, SUM, COL2,
+                   #3, MAX, COL3,
+         """
+
+         src_query_builder = AggregateQueryBuilder(
+             table_conf,
+             src_schema,
+             "source",
+             self._source_engine,
+             self._source,
+         )
+
+         # build Aggregate queries for source,
+         src_agg_queries: list[AggregateQueryRules] = src_query_builder.build_queries()
+
+         # There could be one or more queries per table based on the group by columns
+
+         # build Aggregate queries for target(Databricks),
+         tgt_agg_queries: list[AggregateQueryRules] = AggregateQueryBuilder(
+             table_conf,
+             tgt_schema,
+             "target",
+             self._target_engine,
+             self._target,
+         ).build_queries()
+
+         volume_path = utils.generate_volume_path(table_conf, self._metadata_config)
+
+         table_agg_output: list[AggregateQueryOutput] = []
+
+         # Iterate over the grouped aggregates and reconcile the data
+         # Zip all the keys, read the source, target data for each Aggregate query
+         # and reconcile on the aggregate data
+         # For e.g., (source_query_GRP1, target_query_GRP1), (source_query_GRP2, target_query_GRP2)
+         for src_query_with_rules, tgt_query_with_rules in zip(src_agg_queries, tgt_agg_queries):
+             # For each Aggregate query, read the Source and Target Data and add a hash column
+
+             rules_reconcile_output: list[AggregateQueryOutput] = []
+             src_data = None
+             tgt_data = None
+             joined_df = None
+             data_source_exception = None
+             try:
+                 src_data = self._source.read_data(
+                     catalog=self._database_config.source_catalog,
+                     schema=self._database_config.source_schema,
+                     table=table_conf.source_name,
+                     query=src_query_with_rules.query,
+                     options=table_conf.jdbc_reader_options,
+                 )
+                 tgt_data = self._target.read_data(
+                     catalog=self._database_config.target_catalog,
+                     schema=self._database_config.target_schema,
+                     table=table_conf.target_name,
+                     query=tgt_query_with_rules.query,
+                     options=table_conf.jdbc_reader_options,
+                 )
+                 # Join the Source and Target Aggregated data
+                 joined_df = join_aggregate_data(
+                     source=src_data,
+                     target=tgt_data,
+                     key_columns=src_query_with_rules.group_by_columns,
+                     spark=self._spark,
+                     path=f"{volume_path}{src_query_with_rules.group_by_columns_as_str}",
+                 )
+             except DataSourceRuntimeException as e:
+                 data_source_exception = e
+
+             # For each Aggregated Query, reconcile the data based on the rule
+             for rule in src_query_with_rules.rules:
+                 if data_source_exception:
+                     rule_reconcile_output = DataReconcileOutput(exception=str(data_source_exception))
+                 else:
+                     rule_reconcile_output = reconcile_agg_data_per_rule(
+                         joined_df, src_data.columns, tgt_data.columns, rule
+                     )
+                 rules_reconcile_output.append(AggregateQueryOutput(rule=rule, reconcile_output=rule_reconcile_output))
+
+             # For each table, there could be many Aggregated queries.
+             # Collect the list of Rule Reconcile output per each Aggregate query and append it to the list
+             table_agg_output.extend(rules_reconcile_output)
+         return table_agg_output
+
+     def _get_sample_data(
+         self,
+         table_conf,
+         reconcile_output,
+         src_schema,
+         tgt_schema,
+     ):
+         mismatch = None
+         missing_in_src = None
+         missing_in_tgt = None
+
+         if (
+             reconcile_output.mismatch_count > 0
+             or reconcile_output.missing_in_src_count > 0
+             or reconcile_output.missing_in_tgt_count > 0
+         ):
+             src_sampler = SamplingQueryBuilder(table_conf, src_schema, "source", self._source_engine, self._source)
+             tgt_sampler = SamplingQueryBuilder(table_conf, tgt_schema, "target", self._target_engine, self._target)
+             if reconcile_output.mismatch_count > 0:
+                 mismatch = self._get_mismatch_data(
+                     src_sampler,
+                     tgt_sampler,
+                     reconcile_output.mismatch_count,
+                     reconcile_output.mismatch.mismatch_df,
+                     table_conf.join_columns,
+                     table_conf.source_name,
+                     table_conf.target_name,
+                     table_conf.sampling_options,
+                 )
+
+             if reconcile_output.missing_in_src_count > 0:
+                 missing_in_src = Reconciliation._get_missing_data(
+                     self._target,
+                     tgt_sampler,
+                     reconcile_output.missing_in_src,
+                     self._database_config.target_catalog,
+                     self._database_config.target_schema,
+                     table_conf.target_name,
+                 )
+
+             if reconcile_output.missing_in_tgt_count > 0:
+                 missing_in_tgt = Reconciliation._get_missing_data(
+                     self._source,
+                     src_sampler,
+                     reconcile_output.missing_in_tgt,
+                     self._database_config.source_catalog,
+                     self._database_config.source_schema,
+                     table_conf.source_name,
+                 )
+
+         return DataReconcileOutput(
+             mismatch=mismatch,
+             mismatch_count=reconcile_output.mismatch_count,
+             missing_in_src_count=reconcile_output.missing_in_src_count,
+             missing_in_tgt_count=reconcile_output.missing_in_tgt_count,
+             missing_in_src=missing_in_src,
+             missing_in_tgt=missing_in_tgt,
+         )
+
+     def _get_mismatch_data(
+         self,
+         src_sampler,
+         tgt_sampler,
+         mismatch_count,
+         mismatch,
+         key_columns,
+         src_table: str,
+         tgt_table: str,
+         sampling_options: SamplingOptions,
+     ):
+
+         tgt_sampling_query = tgt_sampler.build_query_with_alias()
+
+         sampling_model_target = self._target.read_data(
+             catalog=self._database_config.target_catalog,
+             schema=self._database_config.target_schema,
+             table=tgt_table,
+             query=tgt_sampling_query,
+             options=None,
+         )
+
+         # Uses pre-calculated `mismatch_count` from `reconcile_output.mismatch_count` to avoid from recomputing `mismatch` for RandomSampler.
+         mismatch_sampler = SamplerFactory.get_sampler(sampling_options)
+         df = mismatch_sampler.sample(mismatch, mismatch_count, key_columns, sampling_model_target).cache()
+
+         src_mismatch_sample_query = src_sampler.build_query(df)
+         tgt_mismatch_sample_query = tgt_sampler.build_query(df)
+
+         src_data = self._source.read_data(
+             catalog=self._database_config.source_catalog,
+             schema=self._database_config.source_schema,
+             table=src_table,
+             query=src_mismatch_sample_query,
+             options=None,
+         )
+         tgt_data = self._target.read_data(
+             catalog=self._database_config.target_catalog,
+             schema=self._database_config.target_schema,
+             table=tgt_table,
+             query=tgt_mismatch_sample_query,
+             options=None,
+         )
+
+         return capture_mismatch_data_and_columns(source=src_data, target=tgt_data, key_columns=key_columns)
+
+     def _reconcile_threshold_data(
+         self,
+         table_conf: Table,
+         src_schema: list[Schema],
+         tgt_schema: list[Schema],
+     ):
+
+         src_data, tgt_data = self._get_threshold_data(table_conf, src_schema, tgt_schema)
+
+         source_view = f"source_{table_conf.source_name}_df_threshold_vw"
+         target_view = f"target_{table_conf.target_name}_df_threshold_vw"
+
+         src_data.createOrReplaceTempView(source_view)
+         tgt_data.createOrReplaceTempView(target_view)
+
+         return self._compute_threshold_comparison(table_conf, src_schema)
+
+     def _get_threshold_data(
+         self,
+         table_conf: Table,
+         src_schema: list[Schema],
+         tgt_schema: list[Schema],
+     ) -> tuple[DataFrame, DataFrame]:
+         src_threshold_query = ThresholdQueryBuilder(
+             table_conf, src_schema, "source", self._source_engine, self._source
+         ).build_threshold_query()
+         tgt_threshold_query = ThresholdQueryBuilder(
+             table_conf, tgt_schema, "target", self._target_engine, self._target
+         ).build_threshold_query()
+
+         src_data = self._source.read_data(
+             catalog=self._database_config.source_catalog,
+             schema=self._database_config.source_schema,
+             table=table_conf.source_name,
+             query=src_threshold_query,
+             options=table_conf.jdbc_reader_options,
+         )
+         tgt_data = self._target.read_data(
+             catalog=self._database_config.target_catalog,
+             schema=self._database_config.target_schema,
+             table=table_conf.target_name,
+             query=tgt_threshold_query,
+             options=table_conf.jdbc_reader_options,
+         )
+
+         return src_data, tgt_data
+
+     def _compute_threshold_comparison(self, table_conf: Table, src_schema: list[Schema]) -> ThresholdOutput:
+         threshold_comparison_query = ThresholdQueryBuilder(
+             table_conf, src_schema, "target", self._target_engine, self._target
+         ).build_comparison_query()
+
+         threshold_result = self._target.read_data(
+             catalog=self._database_config.target_catalog,
+             schema=self._database_config.target_schema,
+             table=table_conf.target_name,
+             query=threshold_comparison_query,
+             options=table_conf.jdbc_reader_options,
+         )
+         threshold_columns = table_conf.get_threshold_columns("source")
+         failed_where_cond = " OR ".join(
+             ["`" + DialectUtils.unnormalize_identifier(name) + "_match` = 'Failed'" for name in threshold_columns]
+         )
+         mismatched_df = threshold_result.filter(failed_where_cond)
+         mismatched_count = mismatched_df.count()
+         threshold_df = None
+         if mismatched_count > 0:
+             threshold_df = mismatched_df.limit(_SAMPLE_ROWS)
+
+         return ThresholdOutput(threshold_df=threshold_df, threshold_mismatch_count=mismatched_count)
+
+     def get_record_count(self, table_conf: Table, report_type: str) -> ReconcileRecordCount:
+         if report_type != "schema":
+             source_count_query = CountQueryBuilder(table_conf, "source", self._source_engine).build_query()
+             target_count_query = CountQueryBuilder(table_conf, "target", self._target_engine).build_query()
+             source_count_row = self._source.read_data(
+                 catalog=self._database_config.source_catalog,
+                 schema=self._database_config.source_schema,
+                 table=table_conf.source_name,
+                 query=source_count_query,
+                 options=None,
+             ).first()
+             target_count_row = self._target.read_data(
+                 catalog=self._database_config.target_catalog,
+                 schema=self._database_config.target_schema,
+                 table=table_conf.target_name,
+                 query=target_count_query,
+                 options=None,
+             ).first()
+
+             source_count = int(source_count_row[0]) if source_count_row is not None else 0
+             target_count = int(target_count_row[0]) if target_count_row is not None else 0
+
+             return ReconcileRecordCount(source=int(source_count), target=int(target_count))
+         return ReconcileRecordCount()
+
+     @staticmethod
+     def _get_missing_data(
+         reader: DataSource,
+         sampler: SamplingQueryBuilder,
+         missing_df: DataFrame,
+         catalog: str,
+         schema: str,
+         table_name: str,
+     ) -> DataFrame:
+         sample_query = sampler.build_query(missing_df)
+         return reader.read_data(
+             catalog=catalog,
+             schema=schema,
+             table=table_name,
+             query=sample_query,
+             options=None,
+         )
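
As an aside (not part of the package), here is a minimal sketch of how the new `Reconciliation` class added above might be driven for a single table, based only on the signatures visible in this diff. The concrete `DataSource` connectors, the `DatabaseConfig`/`ReconcileMetadataConfig` objects, the table configuration and schemas are assumed to already exist; `SchemaCompare(spark=spark)` assumes the comparator only needs a Spark session, and `get_dialect("snowflake")` is an assumed source dialect.

    # Illustrative sketch only -- not part of the packaged module shown above.
    from databricks.labs.lakebridge.reconcile.reconciliation import Reconciliation
    from databricks.labs.lakebridge.reconcile.schema_compare import SchemaCompare
    from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import get_dialect


    def reconcile_one_table(source, target, database_config, metadata_config, table_conf, src_schema, tgt_schema, spark):
        """Run schema, data and record-count reconciliation for one table (all inputs assumed)."""
        reconciler = Reconciliation(
            source=source,                                 # source DataSource connector (assumed)
            target=target,                                 # Databricks DataSource connector (assumed)
            database_config=database_config,
            report_type="all",                             # the class handles "data", "row", "schema" and "all"
            schema_comparator=SchemaCompare(spark=spark),  # assumes SchemaCompare only needs the Spark session
            source_engine=get_dialect("snowflake"),        # assumed source dialect
            spark=spark,
            metadata_config=metadata_config,
        )
        schema_output = reconciler.reconcile_schema(src_schema, tgt_schema, table_conf)
        data_output = reconciler.reconcile_data(table_conf, src_schema, tgt_schema)
        counts = reconciler.get_record_count(table_conf, reconciler.report_type)
        return schema_output, data_output, counts
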
databricks/labs/lakebridge/reconcile/schema_compare.py
@@ -1,10 +1,10 @@
  import logging
- from dataclasses import asdict
 
  from pyspark.sql import DataFrame, SparkSession
  from pyspark.sql.types import BooleanType, StringType, StructField, StructType
  from sqlglot import Dialect, parse_one
 
+ from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils
  from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import get_dialect
  from databricks.labs.lakebridge.reconcile.recon_config import Schema, Table
  from databricks.labs.lakebridge.reconcile.recon_output_config import SchemaMatchResult, SchemaReconcileOutput
@@ -20,8 +20,7 @@ class SchemaCompare:
      ):
          self.spark = spark
 
-     # Define the schema for the schema compare DataFrame
-     _schema_compare_schema: StructType = StructType(
+     _schema_compare_output_schema: StructType = StructType(
          [
              StructField("source_column", StringType(), False),
              StructField("source_datatype", StringType(), False),
@@ -47,14 +46,16 @@ class SchemaCompare:
          target_column_map = table_conf.to_src_col_map or {}
          master_schema_match_res = [
              SchemaMatchResult(
-                 source_column=s.column_name,
-                 databricks_column=target_column_map.get(s.column_name, s.column_name),
+                 source_column_normalized=s.source_normalized_column_name,
+                 source_column_normalized_ansi=s.ansi_normalized_column_name,
                  source_datatype=s.data_type,
+                 databricks_column=target_column_map.get(s.ansi_normalized_column_name, s.ansi_normalized_column_name),
                  databricks_datatype=next(
                      (
                          tgt.data_type
                          for tgt in databricks_schema
-                         if tgt.column_name == target_column_map.get(s.column_name, s.column_name)
+                         if tgt.ansi_normalized_column_name
+                         == target_column_map.get(s.ansi_normalized_column_name, s.ansi_normalized_column_name)
                      ),
                      "",
                  ),
@@ -63,16 +64,22 @@ class SchemaCompare:
          ]
          return master_schema_match_res
 
-     def _create_dataframe(self, data: list, schema: StructType) -> DataFrame:
-         """
-         :param data: Expectation is list of dataclass
-         :param schema: Target schema
-         :return: DataFrame
-         """
-         data = [tuple(asdict(item).values()) for item in data]
-         df = self.spark.createDataFrame(data, schema)
+     def _create_output_dataframe(self, data: list[SchemaMatchResult], schema: StructType) -> DataFrame:
+         """Return a user-friendly dataframe for schema compare result."""
+         transformed = []
+         for item in data:
+             output = tuple(
+                 [
+                     DialectUtils.unnormalize_identifier(item.source_column_normalized_ansi),
+                     item.source_datatype,
+                     DialectUtils.unnormalize_identifier(item.databricks_column),
+                     item.databricks_datatype,
+                     item.is_valid,
+                 ]
+             )
+             transformed.append(output)
 
-         return df
+         return self.spark.createDataFrame(transformed, schema)
 
      @classmethod
      def _parse(cls, source: Dialect, column: str, data_type: str) -> str:
@@ -88,10 +95,10 @@ class SchemaCompare:
 
      @classmethod
      def _validate_parsed_query(cls, master: SchemaMatchResult, parsed_query) -> None:
-         databricks_query = f"create table dummy ({master.source_column} {master.databricks_datatype})"
+         databricks_query = f"create table dummy ({master.source_column_normalized_ansi} {master.databricks_datatype})"
          logger.info(
              f"""
-             Source datatype: create table dummy ({master.source_column} {master.source_datatype})
+             Source datatype: create table dummy ({master.source_column_normalized} {master.source_datatype})
              Parse datatype: {parsed_query}
              Databricks datatype: {databricks_query}
              """
@@ -116,11 +123,11 @@ class SchemaCompare:
          master_schema = self._build_master_schema(source_schema, databricks_schema, table_conf)
          for master in master_schema:
              if not isinstance(source, Databricks):
-                 parsed_query = self._parse(source, master.source_column, master.source_datatype)
+                 parsed_query = self._parse(source, master.source_column_normalized, master.source_datatype)
                  self._validate_parsed_query(master, parsed_query)
              elif master.source_datatype.lower() != master.databricks_datatype.lower():
                  master.is_valid = False
 
-         df = self._create_dataframe(master_schema, self._schema_compare_schema)
+         df = self._create_output_dataframe(master_schema, self._schema_compare_output_schema)
          final_result = self._table_schema_status(master_schema)
          return SchemaReconcileOutput(final_result, df)
databricks/labs/lakebridge/reconcile/trigger_recon_aggregate_service.py
@@ -0,0 +1,78 @@
+ from datetime import datetime
+
+ from pyspark.sql import SparkSession
+ from databricks.sdk import WorkspaceClient
+
+ from databricks.labs.lakebridge.config import ReconcileConfig, TableRecon
+ from databricks.labs.lakebridge.reconcile import utils
+ from databricks.labs.lakebridge.reconcile.exception import DataSourceRuntimeException, ReconciliationException
+ from databricks.labs.lakebridge.reconcile.recon_capture import (
+     ReconIntermediatePersist,
+     generate_final_reconcile_aggregate_output,
+ )
+ from databricks.labs.lakebridge.reconcile.recon_config import AGG_RECONCILE_OPERATION_NAME
+ from databricks.labs.lakebridge.reconcile.recon_output_config import (
+     ReconcileProcessDuration,
+     AggregateQueryOutput,
+     DataReconcileOutput,
+ )
+ from databricks.labs.lakebridge.reconcile.trigger_recon_service import TriggerReconService
+
+
+ class TriggerReconAggregateService:
+     @staticmethod
+     def trigger_recon_aggregates(
+         ws: WorkspaceClient,
+         spark: SparkSession,
+         table_recon: TableRecon,
+         reconcile_config: ReconcileConfig,
+         local_test_run: bool = False,
+     ):
+         reconciler, recon_capture = TriggerReconService.create_recon_dependencies(
+             ws, spark, reconcile_config, local_test_run
+         )
+
+         # Get the Aggregated Reconciliation Output for each table
+         for table_conf in table_recon.tables:
+             recon_process_duration = ReconcileProcessDuration(start_ts=str(datetime.now()), end_ts=None)
+             try:
+                 src_schema, tgt_schema = TriggerReconService.get_schemas(
+                     reconciler.source, reconciler.target, table_conf, reconcile_config.database_config, False
+                 )
+             except DataSourceRuntimeException as e:
+                 raise ReconciliationException(message=str(e)) from e
+
+             assert table_conf.aggregates, "Aggregates must be defined for Aggregates Reconciliation"
+
+             try:
+                 table_reconcile_agg_output_list = reconciler.reconcile_aggregates(table_conf, src_schema, tgt_schema)
+             except DataSourceRuntimeException as e:
+                 table_reconcile_agg_output_list = [
+                     AggregateQueryOutput(reconcile_output=DataReconcileOutput(exception=str(e)), rule=None)
+                 ]
+
+             recon_process_duration.end_ts = str(datetime.now())
+
+             # Persist the data to the delta tables
+             recon_capture.store_aggregates_metrics(
+                 reconcile_agg_output_list=table_reconcile_agg_output_list,
+                 table_conf=table_conf,
+                 recon_process_duration=recon_process_duration,
+             )
+
+             (
+                 ReconIntermediatePersist(
+                     spark=spark,
+                     path=utils.generate_volume_path(table_conf, reconcile_config.metadata_config),
+                 ).clean_unmatched_df_from_volume()
+             )
+
+         return TriggerReconService.verify_successful_reconciliation(
+             generate_final_reconcile_aggregate_output(
+                 recon_id=recon_capture.recon_id,
+                 spark=spark,
+                 metadata_config=reconcile_config.metadata_config,
+                 local_test_run=local_test_run,
+             ),
+             operation_name=AGG_RECONCILE_OPERATION_NAME,
+         )
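
In the same illustrative spirit (again, not part of the package), invoking the new aggregate-reconciliation entry point might look like the sketch below; loading the `TableRecon` and `ReconcileConfig` objects is assumed and not shown in this diff.

    # Illustrative sketch only -- not part of the packaged module shown above.
    from databricks.sdk import WorkspaceClient
    from pyspark.sql import SparkSession

    from databricks.labs.lakebridge.reconcile.trigger_recon_aggregate_service import TriggerReconAggregateService


    def run_aggregate_reconciliation(table_recon, reconcile_config):
        """Trigger aggregate reconciliation for every table in an already-loaded config (assumed inputs)."""
        ws = WorkspaceClient()                      # default workspace authentication
        spark = SparkSession.builder.getOrCreate()
        # May raise ReconciliationException, as the service code above shows for schema-fetch failures.
        return TriggerReconAggregateService.trigger_recon_aggregates(
            ws=ws,
            spark=spark,
            table_recon=table_recon,
            reconcile_config=reconcile_config,
            local_test_run=False,
        )
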