databricks-labs-lakebridge 0.10.6__py3-none-any.whl → 0.10.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. databricks/labs/lakebridge/__about__.py +1 -1
  2. databricks/labs/lakebridge/analyzer/__init__.py +0 -0
  3. databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py +95 -0
  4. databricks/labs/lakebridge/base_install.py +24 -3
  5. databricks/labs/lakebridge/cli.py +19 -53
  6. databricks/labs/lakebridge/contexts/application.py +7 -0
  7. databricks/labs/lakebridge/deployment/job.py +2 -2
  8. databricks/labs/lakebridge/helpers/file_utils.py +36 -0
  9. databricks/labs/lakebridge/install.py +187 -157
  10. databricks/labs/lakebridge/reconcile/compare.py +70 -33
  11. databricks/labs/lakebridge/reconcile/connectors/data_source.py +19 -0
  12. databricks/labs/lakebridge/reconcile/connectors/databricks.py +11 -1
  13. databricks/labs/lakebridge/reconcile/connectors/dialect_utils.py +126 -0
  14. databricks/labs/lakebridge/reconcile/connectors/models.py +7 -0
  15. databricks/labs/lakebridge/reconcile/connectors/oracle.py +11 -1
  16. databricks/labs/lakebridge/reconcile/connectors/snowflake.py +14 -2
  17. databricks/labs/lakebridge/reconcile/connectors/tsql.py +27 -2
  18. databricks/labs/lakebridge/reconcile/constants.py +4 -3
  19. databricks/labs/lakebridge/reconcile/execute.py +9 -810
  20. databricks/labs/lakebridge/reconcile/normalize_recon_config_service.py +133 -0
  21. databricks/labs/lakebridge/reconcile/query_builder/base.py +3 -7
  22. databricks/labs/lakebridge/reconcile/recon_config.py +3 -0
  23. databricks/labs/lakebridge/reconcile/recon_output_config.py +2 -1
  24. databricks/labs/lakebridge/reconcile/reconciliation.py +508 -0
  25. databricks/labs/lakebridge/reconcile/schema_compare.py +26 -19
  26. databricks/labs/lakebridge/reconcile/trigger_recon_aggregate_service.py +98 -0
  27. databricks/labs/lakebridge/reconcile/trigger_recon_service.py +253 -0
  28. databricks/labs/lakebridge/reconcile/utils.py +38 -0
  29. databricks/labs/lakebridge/transpiler/lsp/lsp_engine.py +45 -60
  30. databricks/labs/lakebridge/transpiler/sqlglot/dialect_utils.py +2 -0
  31. databricks/labs/lakebridge/transpiler/transpile_engine.py +0 -18
  32. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/METADATA +1 -1
  33. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/RECORD +37 -28
  34. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/WHEEL +0 -0
  35. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/entry_points.txt +0 -0
  36. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/licenses/LICENSE +0 -0
  37. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/licenses/NOTICE +0 -0
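Most of this release is a refactor of the reconcile entry point: the orchestration that previously lived in reconcile/execute.py moves into the new reconciliation.py, trigger_recon_service.py and trigger_recon_aggregate_service.py modules, and execute.py shrinks to a thin trigger layer. The sketch below illustrates the new delegation as it appears in the hunks that follow; it is not lifted from the package itself, and the reconcile_config keyword plus the surrounding config loading are assumptions, since the diff truncates those call sites.

    # Sketch only: how the slimmed-down execute.py hands off to the new services.
    # `table_recon` and `reconcile_config` are assumed to be loaded elsewhere
    # (e.g. via Installation), as in the previous recon()/reconcile_aggregates() flow.
    from databricks.connect import DatabricksSession
    from databricks.sdk import WorkspaceClient

    from databricks.labs.lakebridge.reconcile.trigger_recon_service import TriggerReconService
    from databricks.labs.lakebridge.reconcile.trigger_recon_aggregate_service import TriggerReconAggregateService

    def run_recon(ws: WorkspaceClient, table_recon, reconcile_config):
        # Row/data/schema reconciliation is now a single call into TriggerReconService.
        return TriggerReconService.trigger_recon(
            ws=ws,
            spark=DatabricksSession.builder.getOrCreate(),
            table_recon=table_recon,
            reconcile_config=reconcile_config,  # assumed: hunk truncates the remaining kwargs
        )

    def run_recon_aggregates(ws: WorkspaceClient, table_recon, reconcile_config):
        # Aggregate reconciliation forces the report type before delegating.
        reconcile_config.report_type = "aggregate"
        return TriggerReconAggregateService.trigger_recon_aggregates(
            ws=ws,
            spark=DatabricksSession.builder.getOrCreate(),
            table_recon=table_recon,
            reconcile_config=reconcile_config,  # assumed, as above
        )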
@@ -1,81 +1,26 @@
  import logging
- import sys
  import os
- from datetime import datetime
- from uuid import uuid4
+ import sys

- from pyspark.errors import PySparkException
- from pyspark.sql import DataFrame, SparkSession
- from sqlglot import Dialect
+ from databricks.connect import DatabricksSession
+ from databricks.labs.blueprint.installation import Installation
+ from databricks.sdk import WorkspaceClient

  from databricks.labs.lakebridge.config import (
-     DatabaseConfig,
      TableRecon,
      ReconcileConfig,
-     ReconcileMetadataConfig,
  )
- from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import get_dialect
- from databricks.labs.lakebridge.reconcile.compare import (
-     capture_mismatch_data_and_columns,
-     reconcile_data,
-     join_aggregate_data,
-     reconcile_agg_data_per_rule,
- )
- from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource
- from databricks.labs.lakebridge.reconcile.connectors.source_adapter import create_adapter
  from databricks.labs.lakebridge.reconcile.exception import (
-     DataSourceRuntimeException,
-     InvalidInputException,
      ReconciliationException,
  )
- from databricks.labs.lakebridge.reconcile.query_builder.aggregate_query import AggregateQueryBuilder
- from databricks.labs.lakebridge.reconcile.query_builder.count_query import CountQueryBuilder
- from databricks.labs.lakebridge.reconcile.query_builder.hash_query import HashQueryBuilder
- from databricks.labs.lakebridge.reconcile.query_builder.sampling_query import (
-     SamplingQueryBuilder,
- )
- from databricks.labs.lakebridge.reconcile.query_builder.threshold_query import (
-     ThresholdQueryBuilder,
- )
- from databricks.labs.lakebridge.reconcile.recon_capture import (
-     ReconCapture,
-     generate_final_reconcile_output,
-     ReconIntermediatePersist,
-     generate_final_reconcile_aggregate_output,
- )
+ from databricks.labs.lakebridge.reconcile.trigger_recon_aggregate_service import TriggerReconAggregateService
  from databricks.labs.lakebridge.reconcile.recon_config import (
-     Schema,
-     Table,
-     AggregateQueryRules,
-     SamplingOptions,
      RECONCILE_OPERATION_NAME,
      AGG_RECONCILE_OPERATION_NAME,
  )
- from databricks.labs.lakebridge.reconcile.recon_output_config import (
-     DataReconcileOutput,
-     ReconcileOutput,
-     ReconcileProcessDuration,
-     SchemaReconcileOutput,
-     ThresholdOutput,
-     ReconcileRecordCount,
-     AggregateQueryOutput,
- )
- from databricks.labs.lakebridge.reconcile.sampler import SamplerFactory
- from databricks.labs.lakebridge.reconcile.schema_compare import SchemaCompare
- from databricks.labs.lakebridge.transpiler.execute import verify_workspace_client
- from databricks.sdk import WorkspaceClient
- from databricks.labs.blueprint.installation import Installation
- from databricks.connect import DatabricksSession
+ from databricks.labs.lakebridge.reconcile.trigger_recon_service import TriggerReconService

  logger = logging.getLogger(__name__)
- _SAMPLE_ROWS = 50
-
-
- def validate_input(input_value: str, list_of_value: set, message: str):
-     if input_value not in list_of_value:
-         error_message = f"{message} --> {input_value} is not one of {list_of_value}"
-         logger.error(error_message)
-         raise InvalidInputException(error_message)


  def main(*argv) -> None:
@@ -118,7 +63,7 @@ def _trigger_recon(
      reconcile_config: ReconcileConfig,
  ):
      try:
-         recon_output = recon(
+         recon_output = TriggerReconService.trigger_recon(
              ws=w,
              spark=DatabricksSession.builder.getOrCreate(),
              table_recon=table_recon,
@@ -154,7 +99,8 @@ def _trigger_reconcile_aggregates(
      after logging the error details.
      """
      try:
-         recon_output = reconcile_aggregates(
+         reconcile_config.report_type = "aggregate"
+         recon_output = TriggerReconAggregateService.trigger_recon_aggregates(
              ws=ws,
              spark=DatabricksSession.builder.getOrCreate(),
              table_recon=table_recon,
@@ -167,753 +113,6 @@ def _trigger_reconcile_aggregates(
          raise e


- def recon(
-     ws: WorkspaceClient,
-     spark: SparkSession,
-     table_recon: TableRecon,
-     reconcile_config: ReconcileConfig,
-     local_test_run: bool = False,
- ) -> ReconcileOutput:
-     """[EXPERIMENTAL] Reconcile the data between the source and target tables."""
-     # verify the workspace client and add proper product and version details
-     # TODO For now we are utilising the
-     # verify_workspace_client from transpile/execute.py file. Later verify_workspace_client function has to be
-     # refactored
-
-     ws_client: WorkspaceClient = verify_workspace_client(ws)
-
-     # validate the report type
-     report_type = reconcile_config.report_type.lower()
-     logger.info(f"report_type: {report_type}, data_source: {reconcile_config.data_source} ")
-     validate_input(report_type, {"schema", "data", "row", "all"}, "Invalid report type")
-
-     source, target = initialise_data_source(
-         engine=get_dialect(reconcile_config.data_source),
-         spark=spark,
-         ws=ws_client,
-         secret_scope=reconcile_config.secret_scope,
-     )
-
-     recon_id = str(uuid4())
-     # initialise the Reconciliation
-     reconciler = Reconciliation(
-         source,
-         target,
-         reconcile_config.database_config,
-         report_type,
-         SchemaCompare(spark=spark),
-         get_dialect(reconcile_config.data_source),
-         spark,
-         metadata_config=reconcile_config.metadata_config,
-     )
-
-     # initialise the recon capture class
-     recon_capture = ReconCapture(
-         database_config=reconcile_config.database_config,
-         recon_id=recon_id,
-         report_type=report_type,
-         source_dialect=get_dialect(reconcile_config.data_source),
-         ws=ws_client,
-         spark=spark,
-         metadata_config=reconcile_config.metadata_config,
-         local_test_run=local_test_run,
-     )
-
-     for table_conf in table_recon.tables:
-         recon_process_duration = ReconcileProcessDuration(start_ts=str(datetime.now()), end_ts=None)
-         schema_reconcile_output = SchemaReconcileOutput(is_valid=True)
-         data_reconcile_output = DataReconcileOutput()
-         try:
-             src_schema, tgt_schema = _get_schema(
-                 source=source, target=target, table_conf=table_conf, database_config=reconcile_config.database_config
-             )
-         except DataSourceRuntimeException as e:
-             schema_reconcile_output = SchemaReconcileOutput(is_valid=False, exception=str(e))
-         else:
-             if report_type in {"schema", "all"}:
-                 schema_reconcile_output = _run_reconcile_schema(
-                     reconciler=reconciler, table_conf=table_conf, src_schema=src_schema, tgt_schema=tgt_schema
-                 )
-                 logger.warning("Schema comparison is completed.")
-
-             if report_type in {"data", "row", "all"}:
-                 data_reconcile_output = _run_reconcile_data(
-                     reconciler=reconciler, table_conf=table_conf, src_schema=src_schema, tgt_schema=tgt_schema
-                 )
-                 logger.warning(f"Reconciliation for '{report_type}' report completed.")
-
-         recon_process_duration.end_ts = str(datetime.now())
-         # Persist the data to the delta tables
-         recon_capture.start(
-             data_reconcile_output=data_reconcile_output,
-             schema_reconcile_output=schema_reconcile_output,
-             table_conf=table_conf,
-             recon_process_duration=recon_process_duration,
-             record_count=reconciler.get_record_count(table_conf, report_type),
-         )
-         if report_type != "schema":
-             ReconIntermediatePersist(
-                 spark=spark, path=generate_volume_path(table_conf, reconcile_config.metadata_config)
-             ).clean_unmatched_df_from_volume()
-
-     return _verify_successful_reconciliation(
-         generate_final_reconcile_output(
-             recon_id=recon_id,
-             spark=spark,
-             metadata_config=reconcile_config.metadata_config,
-             local_test_run=local_test_run,
-         )
-     )
-
-
- def _verify_successful_reconciliation(
-     reconcile_output: ReconcileOutput, operation_name: str = "reconcile"
- ) -> ReconcileOutput:
-     for table_output in reconcile_output.results:
-         if table_output.exception_message or (
-             table_output.status.column is False
-             or table_output.status.row is False
-             or table_output.status.schema is False
-             or table_output.status.aggregate is False
-         ):
-             raise ReconciliationException(
-                 f" Reconciliation failed for one or more tables. Please check the recon metrics for more details."
-                 f" **{operation_name}** failed.",
-                 reconcile_output=reconcile_output,
-             )
-
-     logger.info("Reconciliation completed successfully.")
-     return reconcile_output
-
-
- def generate_volume_path(table_conf: Table, metadata_config: ReconcileMetadataConfig):
-     catalog = metadata_config.catalog
-     schema = metadata_config.schema
-     return f"/Volumes/{catalog}/{schema}/{metadata_config.volume}/{table_conf.source_name}_{table_conf.target_name}/"
-
-
- def initialise_data_source(
-     ws: WorkspaceClient,
-     spark: SparkSession,
-     engine: Dialect,
-     secret_scope: str,
- ):
-     source = create_adapter(engine=engine, spark=spark, ws=ws, secret_scope=secret_scope)
-     target = create_adapter(engine=get_dialect("databricks"), spark=spark, ws=ws, secret_scope=secret_scope)
-
-     return source, target
-
-
- def _get_missing_data(
-     reader: DataSource,
-     sampler: SamplingQueryBuilder,
-     missing_df: DataFrame,
-     catalog: str,
-     schema: str,
-     table_name: str,
- ) -> DataFrame:
-     sample_query = sampler.build_query(missing_df)
-     return reader.read_data(
-         catalog=catalog,
-         schema=schema,
-         table=table_name,
-         query=sample_query,
-         options=None,
-     )
-
-
- def reconcile_aggregates(
-     ws: WorkspaceClient,
-     spark: SparkSession,
-     table_recon: TableRecon,
-     reconcile_config: ReconcileConfig,
-     local_test_run: bool = False,
- ):
-     """[EXPERIMENTAL] Reconcile the aggregated data between the source and target tables.
-     for e.g., COUNT, SUM, AVG of columns between source and target with or without any specific key/group by columns
-     Supported Aggregate functions: MIN, MAX, COUNT, SUM, AVG, MEAN, MODE, PERCENTILE, STDDEV, VARIANCE, MEDIAN
-     """
-     # verify the workspace client and add proper product and version details
-     # TODO For now we are utilising the
-     # verify_workspace_client from transpile/execute.py file. Later verify_workspace_client function has to be
-     # refactored
-
-     ws_client: WorkspaceClient = verify_workspace_client(ws)
-
-     report_type = ""
-     if report_type:
-         logger.info(f"report_type: {report_type}")
-     logger.info(f"data_source: {reconcile_config.data_source}")
-
-     # Read the reconcile_config and initialise the source and target data sources. Target is always Databricks
-     source, target = initialise_data_source(
-         engine=get_dialect(reconcile_config.data_source),
-         spark=spark,
-         ws=ws_client,
-         secret_scope=reconcile_config.secret_scope,
-     )
-
-     # Generate Unique recon_id for every run
-     recon_id = str(uuid4())
-
-     # initialise the Reconciliation
-     reconciler = Reconciliation(
-         source,
-         target,
-         reconcile_config.database_config,
-         report_type,
-         SchemaCompare(spark=spark),
-         get_dialect(reconcile_config.data_source),
-         spark,
-         metadata_config=reconcile_config.metadata_config,
-     )
-
-     # initialise the recon capture class
-     recon_capture = ReconCapture(
-         database_config=reconcile_config.database_config,
-         recon_id=recon_id,
-         report_type=report_type,
-         source_dialect=get_dialect(reconcile_config.data_source),
-         ws=ws_client,
-         spark=spark,
-         metadata_config=reconcile_config.metadata_config,
-         local_test_run=local_test_run,
-     )
-
-     # Get the Aggregated Reconciliation Output for each table
-     for table_conf in table_recon.tables:
-         recon_process_duration = ReconcileProcessDuration(start_ts=str(datetime.now()), end_ts=None)
-         try:
-             src_schema, tgt_schema = _get_schema(
-                 source=source,
-                 target=target,
-                 table_conf=table_conf,
-                 database_config=reconcile_config.database_config,
-             )
-         except DataSourceRuntimeException as e:
-             raise ReconciliationException(message=str(e)) from e
-
-         assert table_conf.aggregates, "Aggregates must be defined for Aggregates Reconciliation"
-
-         table_reconcile_agg_output_list: list[AggregateQueryOutput] = _run_reconcile_aggregates(
-             reconciler=reconciler,
-             table_conf=table_conf,
-             src_schema=src_schema,
-             tgt_schema=tgt_schema,
-         )
-
-         recon_process_duration.end_ts = str(datetime.now())
-
-         # Persist the data to the delta tables
-         recon_capture.store_aggregates_metrics(
-             reconcile_agg_output_list=table_reconcile_agg_output_list,
-             table_conf=table_conf,
-             recon_process_duration=recon_process_duration,
-         )
-
-         (
-             ReconIntermediatePersist(
-                 spark=spark,
-                 path=generate_volume_path(table_conf, reconcile_config.metadata_config),
-             ).clean_unmatched_df_from_volume()
-         )
-
-     return _verify_successful_reconciliation(
-         generate_final_reconcile_aggregate_output(
-             recon_id=recon_id,
-             spark=spark,
-             metadata_config=reconcile_config.metadata_config,
-             local_test_run=local_test_run,
-         ),
-         operation_name=AGG_RECONCILE_OPERATION_NAME,
-     )
-
-
- class Reconciliation:
-
-     def __init__(
-         self,
-         source: DataSource,
-         target: DataSource,
-         database_config: DatabaseConfig,
-         report_type: str,
-         schema_comparator: SchemaCompare,
-         source_engine: Dialect,
-         spark: SparkSession,
-         metadata_config: ReconcileMetadataConfig,
-     ):
-         self._source = source
-         self._target = target
-         self._report_type = report_type
-         self._database_config = database_config
-         self._schema_comparator = schema_comparator
-         self._target_engine = get_dialect("databricks")
-         self._source_engine = source_engine
-         self._spark = spark
-         self._metadata_config = metadata_config
-
-     def reconcile_data(
-         self,
-         table_conf: Table,
-         src_schema: list[Schema],
-         tgt_schema: list[Schema],
-     ) -> DataReconcileOutput:
-         data_reconcile_output = self._get_reconcile_output(table_conf, src_schema, tgt_schema)
-         reconcile_output = data_reconcile_output
-         if self._report_type in {"data", "all"}:
-             reconcile_output = self._get_sample_data(table_conf, data_reconcile_output, src_schema, tgt_schema)
-             if table_conf.get_threshold_columns("source"):
-                 reconcile_output.threshold_output = self._reconcile_threshold_data(table_conf, src_schema, tgt_schema)
-
-         if self._report_type == "row" and table_conf.get_threshold_columns("source"):
-             logger.warning("Threshold comparison is ignored for 'row' report type")
-
-         return reconcile_output
-
-     def reconcile_schema(
-         self,
-         src_schema: list[Schema],
-         tgt_schema: list[Schema],
-         table_conf: Table,
-     ):
-         return self._schema_comparator.compare(src_schema, tgt_schema, self._source_engine, table_conf)
-
-     def reconcile_aggregates(
-         self,
-         table_conf: Table,
-         src_schema: list[Schema],
-         tgt_schema: list[Schema],
-     ) -> list[AggregateQueryOutput]:
-         return self._get_reconcile_aggregate_output(table_conf, src_schema, tgt_schema)
-
-     def _get_reconcile_output(
-         self,
-         table_conf,
-         src_schema,
-         tgt_schema,
-     ):
-         src_hash_query = HashQueryBuilder(table_conf, src_schema, "source", self._source_engine).build_query(
-             report_type=self._report_type
-         )
-         tgt_hash_query = HashQueryBuilder(table_conf, tgt_schema, "target", self._source_engine).build_query(
-             report_type=self._report_type
-         )
-         src_data = self._source.read_data(
-             catalog=self._database_config.source_catalog,
-             schema=self._database_config.source_schema,
-             table=table_conf.source_name,
-             query=src_hash_query,
-             options=table_conf.jdbc_reader_options,
-         )
-         tgt_data = self._target.read_data(
-             catalog=self._database_config.target_catalog,
-             schema=self._database_config.target_schema,
-             table=table_conf.target_name,
-             query=tgt_hash_query,
-             options=table_conf.jdbc_reader_options,
-         )
-
-         volume_path = generate_volume_path(table_conf, self._metadata_config)
-         return reconcile_data(
-             source=src_data,
-             target=tgt_data,
-             key_columns=table_conf.join_columns,
-             report_type=self._report_type,
-             spark=self._spark,
-             path=volume_path,
-         )
-
-     def _get_reconcile_aggregate_output(
-         self,
-         table_conf,
-         src_schema,
-         tgt_schema,
-     ):
-         """
-         Creates a single Query, for the aggregates having the same group by columns. (Ex: 1)
-         If there are no group by columns, all the aggregates are clubbed together in a single query. (Ex: 2)
-         Examples:
-         1. {
-                "type": "MIN",
-                "agg_cols": ["COL1"],
-                "group_by_cols": ["COL4"]
-            },
-            {
-                "type": "MAX",
-                "agg_cols": ["COL2"],
-                "group_by_cols": ["COL9"]
-            },
-            {
-                "type": "COUNT",
-                "agg_cols": ["COL2"],
-                "group_by_cols": ["COL9"]
-            },
-            {
-                "type": "AVG",
-                "agg_cols": ["COL3"],
-                "group_by_cols": ["COL4"]
-            },
-            Query 1: SELECT MIN(COL1), AVG(COL3) FROM :table GROUP BY COL4
-            Rules: ID | Aggregate Type | Column | Group By Column
-                   #1, MIN, COL1, COL4
-                   #2, AVG, COL3, COL4
-            -------------------------------------------------------
-            Query 2: SELECT MAX(COL2), COUNT(COL2) FROM :table GROUP BY COL9
-            Rules: ID | Aggregate Type | Column | Group By Column
-                   #1, MAX, COL2, COL9
-                   #2, COUNT, COL2, COL9
-         2. {
-                "type": "MAX",
-                "agg_cols": ["COL1"]
-            },
-            {
-                "type": "SUM",
-                "agg_cols": ["COL2"]
-            },
-            {
-                "type": "MAX",
-                "agg_cols": ["COL3"]
-            }
-            Query: SELECT MAX(COL1), SUM(COL2), MAX(COL3) FROM :table
-            Rules: ID | Aggregate Type | Column | Group By Column
-                   #1, MAX, COL1,
-                   #2, SUM, COL2,
-                   #3, MAX, COL3,
-         """
-
-         src_query_builder = AggregateQueryBuilder(
-             table_conf,
-             src_schema,
-             "source",
-             self._source_engine,
-         )
-
-         # build Aggregate queries for source,
-         src_agg_queries: list[AggregateQueryRules] = src_query_builder.build_queries()
-
-         # There could be one or more queries per table based on the group by columns
-
-         # build Aggregate queries for target(Databricks),
-         tgt_agg_queries: list[AggregateQueryRules] = AggregateQueryBuilder(
-             table_conf,
-             tgt_schema,
-             "target",
-             self._target_engine,
-         ).build_queries()
-
-         volume_path = generate_volume_path(table_conf, self._metadata_config)
-
-         table_agg_output: list[AggregateQueryOutput] = []
-
-         # Iterate over the grouped aggregates and reconcile the data
-         # Zip all the keys, read the source, target data for each Aggregate query
-         # and reconcile on the aggregate data
-         # For e.g., (source_query_GRP1, target_query_GRP1), (source_query_GRP2, target_query_GRP2)
-         for src_query_with_rules, tgt_query_with_rules in zip(src_agg_queries, tgt_agg_queries):
-             # For each Aggregate query, read the Source and Target Data and add a hash column
-
-             rules_reconcile_output: list[AggregateQueryOutput] = []
-             src_data = None
-             tgt_data = None
-             joined_df = None
-             data_source_exception = None
-             try:
-                 src_data = self._source.read_data(
-                     catalog=self._database_config.source_catalog,
-                     schema=self._database_config.source_schema,
-                     table=table_conf.source_name,
-                     query=src_query_with_rules.query,
-                     options=table_conf.jdbc_reader_options,
-                 )
-                 tgt_data = self._target.read_data(
-                     catalog=self._database_config.target_catalog,
-                     schema=self._database_config.target_schema,
-                     table=table_conf.target_name,
-                     query=tgt_query_with_rules.query,
-                     options=table_conf.jdbc_reader_options,
-                 )
-                 # Join the Source and Target Aggregated data
-                 joined_df = join_aggregate_data(
-                     source=src_data,
-                     target=tgt_data,
-                     key_columns=src_query_with_rules.group_by_columns,
-                     spark=self._spark,
-                     path=f"{volume_path}{src_query_with_rules.group_by_columns_as_str}",
-                 )
-             except DataSourceRuntimeException as e:
-                 data_source_exception = e
-
-             # For each Aggregated Query, reconcile the data based on the rule
-             for rule in src_query_with_rules.rules:
-                 if data_source_exception:
-                     rule_reconcile_output = DataReconcileOutput(exception=str(data_source_exception))
-                 else:
-                     rule_reconcile_output = reconcile_agg_data_per_rule(
-                         joined_df, src_data.columns, tgt_data.columns, rule
-                     )
-                 rules_reconcile_output.append(AggregateQueryOutput(rule=rule, reconcile_output=rule_reconcile_output))
-
-             # For each table, there could be many Aggregated queries.
-             # Collect the list of Rule Reconcile output per each Aggregate query and append it to the list
-             table_agg_output.extend(rules_reconcile_output)
-         return table_agg_output
-
-     def _get_sample_data(
-         self,
-         table_conf,
-         reconcile_output,
-         src_schema,
-         tgt_schema,
-     ):
-         mismatch = None
-         missing_in_src = None
-         missing_in_tgt = None
-
-         if (
-             reconcile_output.mismatch_count > 0
-             or reconcile_output.missing_in_src_count > 0
-             or reconcile_output.missing_in_tgt_count > 0
-         ):
-             src_sampler = SamplingQueryBuilder(table_conf, src_schema, "source", self._source_engine)
-             tgt_sampler = SamplingQueryBuilder(table_conf, tgt_schema, "target", self._target_engine)
-             if reconcile_output.mismatch_count > 0:
-                 mismatch = self._get_mismatch_data(
-                     src_sampler,
-                     tgt_sampler,
-                     reconcile_output.mismatch_count,
-                     reconcile_output.mismatch.mismatch_df,
-                     table_conf.join_columns,
-                     table_conf.source_name,
-                     table_conf.target_name,
-                     table_conf.sampling_options,
-                 )
-
-             if reconcile_output.missing_in_src_count > 0:
-                 missing_in_src = _get_missing_data(
-                     self._target,
-                     tgt_sampler,
-                     reconcile_output.missing_in_src,
-                     self._database_config.target_catalog,
-                     self._database_config.target_schema,
-                     table_conf.target_name,
-                 )
-
-             if reconcile_output.missing_in_tgt_count > 0:
-                 missing_in_tgt = _get_missing_data(
-                     self._source,
-                     src_sampler,
-                     reconcile_output.missing_in_tgt,
-                     self._database_config.source_catalog,
-                     self._database_config.source_schema,
-                     table_conf.source_name,
-                 )
-
-         return DataReconcileOutput(
-             mismatch=mismatch,
-             mismatch_count=reconcile_output.mismatch_count,
-             missing_in_src_count=reconcile_output.missing_in_src_count,
-             missing_in_tgt_count=reconcile_output.missing_in_tgt_count,
-             missing_in_src=missing_in_src,
-             missing_in_tgt=missing_in_tgt,
-         )
-
-     def _get_mismatch_data(
-         self,
-         src_sampler,
-         tgt_sampler,
-         mismatch_count,
-         mismatch,
-         key_columns,
-         src_table: str,
-         tgt_table: str,
-         sampling_options: SamplingOptions,
-     ):
-
-         tgt_sampling_query = tgt_sampler.build_query_with_alias()
-
-         sampling_model_target = self._target.read_data(
-             catalog=self._database_config.target_catalog,
-             schema=self._database_config.target_schema,
-             table=tgt_table,
-             query=tgt_sampling_query,
-             options=None,
-         )
-
-         # Uses pre-calculated `mismatch_count` from `reconcile_output.mismatch_count` to avoid from recomputing `mismatch` for RandomSampler.
-         mismatch_sampler = SamplerFactory.get_sampler(sampling_options)
-         df = mismatch_sampler.sample(mismatch, mismatch_count, key_columns, sampling_model_target).cache()
-
-         src_mismatch_sample_query = src_sampler.build_query(df)
-         tgt_mismatch_sample_query = tgt_sampler.build_query(df)
-
-         src_data = self._source.read_data(
-             catalog=self._database_config.source_catalog,
-             schema=self._database_config.source_schema,
-             table=src_table,
-             query=src_mismatch_sample_query,
-             options=None,
-         )
-         tgt_data = self._target.read_data(
-             catalog=self._database_config.target_catalog,
-             schema=self._database_config.target_schema,
-             table=tgt_table,
-             query=tgt_mismatch_sample_query,
-             options=None,
-         )
-
-         return capture_mismatch_data_and_columns(source=src_data, target=tgt_data, key_columns=key_columns)
-
-     def _reconcile_threshold_data(
-         self,
-         table_conf: Table,
-         src_schema: list[Schema],
-         tgt_schema: list[Schema],
-     ):
-
-         src_data, tgt_data = self._get_threshold_data(table_conf, src_schema, tgt_schema)
-
-         source_view = f"source_{table_conf.source_name}_df_threshold_vw"
-         target_view = f"target_{table_conf.target_name}_df_threshold_vw"
-
-         src_data.createOrReplaceTempView(source_view)
-         tgt_data.createOrReplaceTempView(target_view)
-
-         return self._compute_threshold_comparison(table_conf, src_schema)
-
-     def _get_threshold_data(
-         self,
-         table_conf: Table,
-         src_schema: list[Schema],
-         tgt_schema: list[Schema],
-     ) -> tuple[DataFrame, DataFrame]:
-         src_threshold_query = ThresholdQueryBuilder(
-             table_conf, src_schema, "source", self._source_engine
-         ).build_threshold_query()
-         tgt_threshold_query = ThresholdQueryBuilder(
-             table_conf, tgt_schema, "target", self._target_engine
-         ).build_threshold_query()
-
-         src_data = self._source.read_data(
-             catalog=self._database_config.source_catalog,
-             schema=self._database_config.source_schema,
-             table=table_conf.source_name,
-             query=src_threshold_query,
-             options=table_conf.jdbc_reader_options,
-         )
-         tgt_data = self._target.read_data(
-             catalog=self._database_config.target_catalog,
-             schema=self._database_config.target_schema,
-             table=table_conf.target_name,
-             query=tgt_threshold_query,
-             options=table_conf.jdbc_reader_options,
-         )
-
-         return src_data, tgt_data
-
-     def _compute_threshold_comparison(self, table_conf: Table, src_schema: list[Schema]) -> ThresholdOutput:
-         threshold_comparison_query = ThresholdQueryBuilder(
-             table_conf, src_schema, "target", self._target_engine
-         ).build_comparison_query()
-
-         threshold_result = self._target.read_data(
-             catalog=self._database_config.target_catalog,
-             schema=self._database_config.target_schema,
-             table=table_conf.target_name,
-             query=threshold_comparison_query,
-             options=table_conf.jdbc_reader_options,
-         )
-         threshold_columns = table_conf.get_threshold_columns("source")
-         failed_where_cond = " OR ".join([name + "_match = 'Failed'" for name in threshold_columns])
-         mismatched_df = threshold_result.filter(failed_where_cond)
-         mismatched_count = mismatched_df.count()
-         threshold_df = None
-         if mismatched_count > 0:
-             threshold_df = mismatched_df.limit(_SAMPLE_ROWS)
-
-         return ThresholdOutput(threshold_df=threshold_df, threshold_mismatch_count=mismatched_count)
-
-     def get_record_count(self, table_conf: Table, report_type: str) -> ReconcileRecordCount:
-         if report_type != "schema":
-             source_count_query = CountQueryBuilder(table_conf, "source", self._source_engine).build_query()
-             target_count_query = CountQueryBuilder(table_conf, "target", self._target_engine).build_query()
-             source_count_row = self._source.read_data(
-                 catalog=self._database_config.source_catalog,
-                 schema=self._database_config.source_schema,
-                 table=table_conf.source_name,
-                 query=source_count_query,
-                 options=None,
-             ).first()
-             target_count_row = self._target.read_data(
-                 catalog=self._database_config.target_catalog,
-                 schema=self._database_config.target_schema,
-                 table=table_conf.target_name,
-                 query=target_count_query,
-                 options=None,
-             ).first()
-
-             source_count = int(source_count_row[0]) if source_count_row is not None else 0
-             target_count = int(target_count_row[0]) if target_count_row is not None else 0
-
-             return ReconcileRecordCount(source=int(source_count), target=int(target_count))
-         return ReconcileRecordCount()
-
-
- def _get_schema(
-     source: DataSource,
-     target: DataSource,
-     table_conf: Table,
-     database_config: DatabaseConfig,
- ) -> tuple[list[Schema], list[Schema]]:
-     src_schema = source.get_schema(
-         catalog=database_config.source_catalog,
-         schema=database_config.source_schema,
-         table=table_conf.source_name,
-     )
-     tgt_schema = target.get_schema(
-         catalog=database_config.target_catalog,
-         schema=database_config.target_schema,
-         table=table_conf.target_name,
-     )
-
-     return src_schema, tgt_schema
-
-
- def _run_reconcile_data(
-     reconciler: Reconciliation,
-     table_conf: Table,
-     src_schema: list[Schema],
-     tgt_schema: list[Schema],
- ) -> DataReconcileOutput:
-     try:
-         return reconciler.reconcile_data(table_conf=table_conf, src_schema=src_schema, tgt_schema=tgt_schema)
-     except DataSourceRuntimeException as e:
-         return DataReconcileOutput(exception=str(e))
-
-
- def _run_reconcile_schema(
-     reconciler: Reconciliation,
-     table_conf: Table,
-     src_schema: list[Schema],
-     tgt_schema: list[Schema],
- ):
-     try:
-         return reconciler.reconcile_schema(table_conf=table_conf, src_schema=src_schema, tgt_schema=tgt_schema)
-     except PySparkException as e:
-         return SchemaReconcileOutput(is_valid=False, exception=str(e))
-
-
- def _run_reconcile_aggregates(
-     reconciler: Reconciliation,
-     table_conf: Table,
-     src_schema: list[Schema],
-     tgt_schema: list[Schema],
- ) -> list[AggregateQueryOutput]:
-     try:
-         return reconciler.reconcile_aggregates(table_conf, src_schema, tgt_schema)
-     except DataSourceRuntimeException as e:
-         return [AggregateQueryOutput(reconcile_output=DataReconcileOutput(exception=str(e)), rule=None)]
-
-
  if __name__ == "__main__":
      if "DATABRICKS_RUNTIME_VERSION" not in os.environ:
          raise SystemExit("Only intended to run in Databricks Runtime")
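
For context on the aggregate reconciliation that moved out of execute.py, the docstring removed above describes how aggregate definitions are grouped into queries. The illustration below restates that grouping rule; the COL*/":table" placeholders come from the docstring itself and are hypothetical, not real schema objects.

    # Aggregates sharing the same group-by columns collapse into one query per group,
    # per the relocated _get_reconcile_aggregate_output docstring.
    aggregates = [
        {"type": "MIN", "agg_cols": ["COL1"], "group_by_cols": ["COL4"]},
        {"type": "AVG", "agg_cols": ["COL3"], "group_by_cols": ["COL4"]},
        {"type": "MAX", "agg_cols": ["COL2"], "group_by_cols": ["COL9"]},
        {"type": "COUNT", "agg_cols": ["COL2"], "group_by_cols": ["COL9"]},
    ]
    # Expected grouping:
    #   SELECT MIN(COL1), AVG(COL3) FROM :table GROUP BY COL4
    #   SELECT MAX(COL2), COUNT(COL2) FROM :table GROUP BY COL9
    # With no group_by_cols at all, every aggregate lands in a single query:
    #   SELECT MAX(COL1), SUM(COL2), MAX(COL3) FROM :table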