databricks-labs-lakebridge 0.10.6__py3-none-any.whl → 0.10.7__py3-none-any.whl
This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- databricks/labs/lakebridge/__about__.py +1 -1
- databricks/labs/lakebridge/analyzer/__init__.py +0 -0
- databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py +95 -0
- databricks/labs/lakebridge/base_install.py +24 -3
- databricks/labs/lakebridge/cli.py +19 -53
- databricks/labs/lakebridge/contexts/application.py +7 -0
- databricks/labs/lakebridge/deployment/job.py +2 -2
- databricks/labs/lakebridge/helpers/file_utils.py +36 -0
- databricks/labs/lakebridge/install.py +187 -157
- databricks/labs/lakebridge/reconcile/compare.py +70 -33
- databricks/labs/lakebridge/reconcile/connectors/data_source.py +19 -0
- databricks/labs/lakebridge/reconcile/connectors/databricks.py +11 -1
- databricks/labs/lakebridge/reconcile/connectors/dialect_utils.py +126 -0
- databricks/labs/lakebridge/reconcile/connectors/models.py +7 -0
- databricks/labs/lakebridge/reconcile/connectors/oracle.py +11 -1
- databricks/labs/lakebridge/reconcile/connectors/snowflake.py +14 -2
- databricks/labs/lakebridge/reconcile/connectors/tsql.py +27 -2
- databricks/labs/lakebridge/reconcile/constants.py +4 -3
- databricks/labs/lakebridge/reconcile/execute.py +9 -810
- databricks/labs/lakebridge/reconcile/normalize_recon_config_service.py +133 -0
- databricks/labs/lakebridge/reconcile/query_builder/base.py +3 -7
- databricks/labs/lakebridge/reconcile/recon_config.py +3 -0
- databricks/labs/lakebridge/reconcile/recon_output_config.py +2 -1
- databricks/labs/lakebridge/reconcile/reconciliation.py +508 -0
- databricks/labs/lakebridge/reconcile/schema_compare.py +26 -19
- databricks/labs/lakebridge/reconcile/trigger_recon_aggregate_service.py +98 -0
- databricks/labs/lakebridge/reconcile/trigger_recon_service.py +253 -0
- databricks/labs/lakebridge/reconcile/utils.py +38 -0
- databricks/labs/lakebridge/transpiler/lsp/lsp_engine.py +45 -60
- databricks/labs/lakebridge/transpiler/sqlglot/dialect_utils.py +2 -0
- databricks/labs/lakebridge/transpiler/transpile_engine.py +0 -18
- {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/METADATA +1 -1
- {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/RECORD +37 -28
- {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/WHEEL +0 -0
- {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/entry_points.txt +0 -0
- {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/licenses/LICENSE +0 -0
- {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/licenses/NOTICE +0 -0
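The headline change in 0.10.7 is a refactor of the reconcile entry points: `reconcile/execute.py` sheds roughly 800 lines, while dedicated modules appear for the orchestration (`reconciliation.py`, `trigger_recon_service.py`, `trigger_recon_aggregate_service.py`, `utils.py`, `normalize_recon_config_service.py`). A minimal sketch of the import surface this implies: the two trigger-service imports are taken verbatim from the diff below, while the new home of the `Reconciliation` class is an assumption based on the added `reconcile/reconciliation.py`:

```python
# Trigger services: imported by execute.py in the diff below.
from databricks.labs.lakebridge.reconcile.trigger_recon_service import TriggerReconService
from databricks.labs.lakebridge.reconcile.trigger_recon_aggregate_service import TriggerReconAggregateService

# Assumption: the Reconciliation class removed from execute.py now lives
# in the new reconcile/reconciliation.py (+508 lines in the file list).
from databricks.labs.lakebridge.reconcile.reconciliation import Reconciliation
```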
databricks/labs/lakebridge/reconcile/execute.py
@@ -1,81 +1,26 @@
 import logging
-import sys
 import os
-
-from uuid import uuid4
+import sys
 
-from
-from
-from
+from databricks.connect import DatabricksSession
+from databricks.labs.blueprint.installation import Installation
+from databricks.sdk import WorkspaceClient
 
 from databricks.labs.lakebridge.config import (
-    DatabaseConfig,
     TableRecon,
     ReconcileConfig,
-    ReconcileMetadataConfig,
 )
-from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import get_dialect
-from databricks.labs.lakebridge.reconcile.compare import (
-    capture_mismatch_data_and_columns,
-    reconcile_data,
-    join_aggregate_data,
-    reconcile_agg_data_per_rule,
-)
-from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource
-from databricks.labs.lakebridge.reconcile.connectors.source_adapter import create_adapter
 from databricks.labs.lakebridge.reconcile.exception import (
-    DataSourceRuntimeException,
-    InvalidInputException,
     ReconciliationException,
 )
-from databricks.labs.lakebridge.reconcile.
-from databricks.labs.lakebridge.reconcile.query_builder.count_query import CountQueryBuilder
-from databricks.labs.lakebridge.reconcile.query_builder.hash_query import HashQueryBuilder
-from databricks.labs.lakebridge.reconcile.query_builder.sampling_query import (
-    SamplingQueryBuilder,
-)
-from databricks.labs.lakebridge.reconcile.query_builder.threshold_query import (
-    ThresholdQueryBuilder,
-)
-from databricks.labs.lakebridge.reconcile.recon_capture import (
-    ReconCapture,
-    generate_final_reconcile_output,
-    ReconIntermediatePersist,
-    generate_final_reconcile_aggregate_output,
-)
+from databricks.labs.lakebridge.reconcile.trigger_recon_aggregate_service import TriggerReconAggregateService
 from databricks.labs.lakebridge.reconcile.recon_config import (
-    Schema,
-    Table,
-    AggregateQueryRules,
-    SamplingOptions,
     RECONCILE_OPERATION_NAME,
     AGG_RECONCILE_OPERATION_NAME,
 )
-from databricks.labs.lakebridge.reconcile.
-    DataReconcileOutput,
-    ReconcileOutput,
-    ReconcileProcessDuration,
-    SchemaReconcileOutput,
-    ThresholdOutput,
-    ReconcileRecordCount,
-    AggregateQueryOutput,
-)
-from databricks.labs.lakebridge.reconcile.sampler import SamplerFactory
-from databricks.labs.lakebridge.reconcile.schema_compare import SchemaCompare
-from databricks.labs.lakebridge.transpiler.execute import verify_workspace_client
-from databricks.sdk import WorkspaceClient
-from databricks.labs.blueprint.installation import Installation
-from databricks.connect import DatabricksSession
+from databricks.labs.lakebridge.reconcile.trigger_recon_service import TriggerReconService
 
 logger = logging.getLogger(__name__)
-_SAMPLE_ROWS = 50
-
-
-def validate_input(input_value: str, list_of_value: set, message: str):
-    if input_value not in list_of_value:
-        error_message = f"{message} --> {input_value} is not one of {list_of_value}"
-        logger.error(error_message)
-        raise InvalidInputException(error_message)
 
 
 def main(*argv) -> None:
@@ -118,7 +63,7 @@ def _trigger_recon(
     reconcile_config: ReconcileConfig,
 ):
     try:
-        recon_output =
+        recon_output = TriggerReconService.trigger_recon(
             ws=w,
             spark=DatabricksSession.builder.getOrCreate(),
             table_recon=table_recon,
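The module-level orchestration that `_trigger_recon` used to inline is now delegated in a single call. A hedged usage sketch, built only from the keyword arguments visible in this hunk; passing `reconcile_config` through is an assumption, since the hunk is truncated after `table_recon`:

```python
from databricks.connect import DatabricksSession
from databricks.sdk import WorkspaceClient

from databricks.labs.lakebridge.config import ReconcileConfig, TableRecon
from databricks.labs.lakebridge.reconcile.trigger_recon_service import TriggerReconService


def run_recon(ws: WorkspaceClient, table_recon: TableRecon, reconcile_config: ReconcileConfig):
    # Mirrors the new call site above; run_recon itself is illustrative.
    return TriggerReconService.trigger_recon(
        ws=ws,
        spark=DatabricksSession.builder.getOrCreate(),
        table_recon=table_recon,
        reconcile_config=reconcile_config,  # assumed: not visible in the truncated hunk
    )
```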
@@ -154,7 +99,8 @@ def _trigger_reconcile_aggregates(
    after logging the error details.
     """
     try:
-
+        reconcile_config.report_type = "aggregate"
+        recon_output = TriggerReconAggregateService.trigger_recon_aggregates(
             ws=ws,
             spark=DatabricksSession.builder.getOrCreate(),
             table_recon=table_recon,
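One behavioural detail rides along with the refactor: the new code pins `reconcile_config.report_type` to `"aggregate"` before delegating, where the removed `reconcile_aggregates` below passed an empty `report_type` through to `ReconCapture`. A sketch of the new call shape, with the same caveat that the hunk is truncated after `table_recon`:

```python
from databricks.labs.lakebridge.reconcile.trigger_recon_aggregate_service import TriggerReconAggregateService


def run_aggregates_recon(ws, spark, table_recon, reconcile_config):
    # The report type is forced before delegation, as in the hunk above.
    reconcile_config.report_type = "aggregate"
    return TriggerReconAggregateService.trigger_recon_aggregates(
        ws=ws,
        spark=spark,
        table_recon=table_recon,
        reconcile_config=reconcile_config,  # assumed, as in the sketch above
    )
```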
@@ -167,753 +113,6 @@ def _trigger_reconcile_aggregates(
         raise e
 
 
-def recon(
-    ws: WorkspaceClient,
-    spark: SparkSession,
-    table_recon: TableRecon,
-    reconcile_config: ReconcileConfig,
-    local_test_run: bool = False,
-) -> ReconcileOutput:
-    """[EXPERIMENTAL] Reconcile the data between the source and target tables."""
-    # verify the workspace client and add proper product and version details
-    # TODO For now we are utilising the
-    # verify_workspace_client from transpile/execute.py file. Later verify_workspace_client function has to be
-    # refactored
-
-    ws_client: WorkspaceClient = verify_workspace_client(ws)
-
-    # validate the report type
-    report_type = reconcile_config.report_type.lower()
-    logger.info(f"report_type: {report_type}, data_source: {reconcile_config.data_source} ")
-    validate_input(report_type, {"schema", "data", "row", "all"}, "Invalid report type")
-
-    source, target = initialise_data_source(
-        engine=get_dialect(reconcile_config.data_source),
-        spark=spark,
-        ws=ws_client,
-        secret_scope=reconcile_config.secret_scope,
-    )
-
-    recon_id = str(uuid4())
-    # initialise the Reconciliation
-    reconciler = Reconciliation(
-        source,
-        target,
-        reconcile_config.database_config,
-        report_type,
-        SchemaCompare(spark=spark),
-        get_dialect(reconcile_config.data_source),
-        spark,
-        metadata_config=reconcile_config.metadata_config,
-    )
-
-    # initialise the recon capture class
-    recon_capture = ReconCapture(
-        database_config=reconcile_config.database_config,
-        recon_id=recon_id,
-        report_type=report_type,
-        source_dialect=get_dialect(reconcile_config.data_source),
-        ws=ws_client,
-        spark=spark,
-        metadata_config=reconcile_config.metadata_config,
-        local_test_run=local_test_run,
-    )
-
-    for table_conf in table_recon.tables:
-        recon_process_duration = ReconcileProcessDuration(start_ts=str(datetime.now()), end_ts=None)
-        schema_reconcile_output = SchemaReconcileOutput(is_valid=True)
-        data_reconcile_output = DataReconcileOutput()
-        try:
-            src_schema, tgt_schema = _get_schema(
-                source=source, target=target, table_conf=table_conf, database_config=reconcile_config.database_config
-            )
-        except DataSourceRuntimeException as e:
-            schema_reconcile_output = SchemaReconcileOutput(is_valid=False, exception=str(e))
-        else:
-            if report_type in {"schema", "all"}:
-                schema_reconcile_output = _run_reconcile_schema(
-                    reconciler=reconciler, table_conf=table_conf, src_schema=src_schema, tgt_schema=tgt_schema
-                )
-                logger.warning("Schema comparison is completed.")
-
-            if report_type in {"data", "row", "all"}:
-                data_reconcile_output = _run_reconcile_data(
-                    reconciler=reconciler, table_conf=table_conf, src_schema=src_schema, tgt_schema=tgt_schema
-                )
-                logger.warning(f"Reconciliation for '{report_type}' report completed.")
-
-        recon_process_duration.end_ts = str(datetime.now())
-        # Persist the data to the delta tables
-        recon_capture.start(
-            data_reconcile_output=data_reconcile_output,
-            schema_reconcile_output=schema_reconcile_output,
-            table_conf=table_conf,
-            recon_process_duration=recon_process_duration,
-            record_count=reconciler.get_record_count(table_conf, report_type),
-        )
-        if report_type != "schema":
-            ReconIntermediatePersist(
-                spark=spark, path=generate_volume_path(table_conf, reconcile_config.metadata_config)
-            ).clean_unmatched_df_from_volume()
-
-    return _verify_successful_reconciliation(
-        generate_final_reconcile_output(
-            recon_id=recon_id,
-            spark=spark,
-            metadata_config=reconcile_config.metadata_config,
-            local_test_run=local_test_run,
-        )
-    )
-
-
-def _verify_successful_reconciliation(
-    reconcile_output: ReconcileOutput, operation_name: str = "reconcile"
-) -> ReconcileOutput:
-    for table_output in reconcile_output.results:
-        if table_output.exception_message or (
-            table_output.status.column is False
-            or table_output.status.row is False
-            or table_output.status.schema is False
-            or table_output.status.aggregate is False
-        ):
-            raise ReconciliationException(
-                f" Reconciliation failed for one or more tables. Please check the recon metrics for more details."
-                f" **{operation_name}** failed.",
-                reconcile_output=reconcile_output,
-            )
-
-    logger.info("Reconciliation completed successfully.")
-    return reconcile_output
-
-
-def generate_volume_path(table_conf: Table, metadata_config: ReconcileMetadataConfig):
-    catalog = metadata_config.catalog
-    schema = metadata_config.schema
-    return f"/Volumes/{catalog}/{schema}/{metadata_config.volume}/{table_conf.source_name}_{table_conf.target_name}/"
-
-
-def initialise_data_source(
-    ws: WorkspaceClient,
-    spark: SparkSession,
-    engine: Dialect,
-    secret_scope: str,
-):
-    source = create_adapter(engine=engine, spark=spark, ws=ws, secret_scope=secret_scope)
-    target = create_adapter(engine=get_dialect("databricks"), spark=spark, ws=ws, secret_scope=secret_scope)
-
-    return source, target
-
-
-def _get_missing_data(
-    reader: DataSource,
-    sampler: SamplingQueryBuilder,
-    missing_df: DataFrame,
-    catalog: str,
-    schema: str,
-    table_name: str,
-) -> DataFrame:
-    sample_query = sampler.build_query(missing_df)
-    return reader.read_data(
-        catalog=catalog,
-        schema=schema,
-        table=table_name,
-        query=sample_query,
-        options=None,
-    )
-
-
-def reconcile_aggregates(
-    ws: WorkspaceClient,
-    spark: SparkSession,
-    table_recon: TableRecon,
-    reconcile_config: ReconcileConfig,
-    local_test_run: bool = False,
-):
-    """[EXPERIMENTAL] Reconcile the aggregated data between the source and target tables.
-    for e.g., COUNT, SUM, AVG of columns between source and target with or without any specific key/group by columns
-    Supported Aggregate functions: MIN, MAX, COUNT, SUM, AVG, MEAN, MODE, PERCENTILE, STDDEV, VARIANCE, MEDIAN
-    """
-    # verify the workspace client and add proper product and version details
-    # TODO For now we are utilising the
-    # verify_workspace_client from transpile/execute.py file. Later verify_workspace_client function has to be
-    # refactored
-
-    ws_client: WorkspaceClient = verify_workspace_client(ws)
-
-    report_type = ""
-    if report_type:
-        logger.info(f"report_type: {report_type}")
-    logger.info(f"data_source: {reconcile_config.data_source}")
-
-    # Read the reconcile_config and initialise the source and target data sources. Target is always Databricks
-    source, target = initialise_data_source(
-        engine=get_dialect(reconcile_config.data_source),
-        spark=spark,
-        ws=ws_client,
-        secret_scope=reconcile_config.secret_scope,
-    )
-
-    # Generate Unique recon_id for every run
-    recon_id = str(uuid4())
-
-    # initialise the Reconciliation
-    reconciler = Reconciliation(
-        source,
-        target,
-        reconcile_config.database_config,
-        report_type,
-        SchemaCompare(spark=spark),
-        get_dialect(reconcile_config.data_source),
-        spark,
-        metadata_config=reconcile_config.metadata_config,
-    )
-
-    # initialise the recon capture class
-    recon_capture = ReconCapture(
-        database_config=reconcile_config.database_config,
-        recon_id=recon_id,
-        report_type=report_type,
-        source_dialect=get_dialect(reconcile_config.data_source),
-        ws=ws_client,
-        spark=spark,
-        metadata_config=reconcile_config.metadata_config,
-        local_test_run=local_test_run,
-    )
-
-    # Get the Aggregated Reconciliation Output for each table
-    for table_conf in table_recon.tables:
-        recon_process_duration = ReconcileProcessDuration(start_ts=str(datetime.now()), end_ts=None)
-        try:
-            src_schema, tgt_schema = _get_schema(
-                source=source,
-                target=target,
-                table_conf=table_conf,
-                database_config=reconcile_config.database_config,
-            )
-        except DataSourceRuntimeException as e:
-            raise ReconciliationException(message=str(e)) from e
-
-        assert table_conf.aggregates, "Aggregates must be defined for Aggregates Reconciliation"
-
-        table_reconcile_agg_output_list: list[AggregateQueryOutput] = _run_reconcile_aggregates(
-            reconciler=reconciler,
-            table_conf=table_conf,
-            src_schema=src_schema,
-            tgt_schema=tgt_schema,
-        )
-
-        recon_process_duration.end_ts = str(datetime.now())
-
-        # Persist the data to the delta tables
-        recon_capture.store_aggregates_metrics(
-            reconcile_agg_output_list=table_reconcile_agg_output_list,
-            table_conf=table_conf,
-            recon_process_duration=recon_process_duration,
-        )
-
-        (
-            ReconIntermediatePersist(
-                spark=spark,
-                path=generate_volume_path(table_conf, reconcile_config.metadata_config),
-            ).clean_unmatched_df_from_volume()
-        )
-
-    return _verify_successful_reconciliation(
-        generate_final_reconcile_aggregate_output(
-            recon_id=recon_id,
-            spark=spark,
-            metadata_config=reconcile_config.metadata_config,
-            local_test_run=local_test_run,
-        ),
-        operation_name=AGG_RECONCILE_OPERATION_NAME,
-    )
-
-
-class Reconciliation:
-
-    def __init__(
-        self,
-        source: DataSource,
-        target: DataSource,
-        database_config: DatabaseConfig,
-        report_type: str,
-        schema_comparator: SchemaCompare,
-        source_engine: Dialect,
-        spark: SparkSession,
-        metadata_config: ReconcileMetadataConfig,
-    ):
-        self._source = source
-        self._target = target
-        self._report_type = report_type
-        self._database_config = database_config
-        self._schema_comparator = schema_comparator
-        self._target_engine = get_dialect("databricks")
-        self._source_engine = source_engine
-        self._spark = spark
-        self._metadata_config = metadata_config
-
-    def reconcile_data(
-        self,
-        table_conf: Table,
-        src_schema: list[Schema],
-        tgt_schema: list[Schema],
-    ) -> DataReconcileOutput:
-        data_reconcile_output = self._get_reconcile_output(table_conf, src_schema, tgt_schema)
-        reconcile_output = data_reconcile_output
-        if self._report_type in {"data", "all"}:
-            reconcile_output = self._get_sample_data(table_conf, data_reconcile_output, src_schema, tgt_schema)
-            if table_conf.get_threshold_columns("source"):
-                reconcile_output.threshold_output = self._reconcile_threshold_data(table_conf, src_schema, tgt_schema)
-
-        if self._report_type == "row" and table_conf.get_threshold_columns("source"):
-            logger.warning("Threshold comparison is ignored for 'row' report type")
-
-        return reconcile_output
-
-    def reconcile_schema(
-        self,
-        src_schema: list[Schema],
-        tgt_schema: list[Schema],
-        table_conf: Table,
-    ):
-        return self._schema_comparator.compare(src_schema, tgt_schema, self._source_engine, table_conf)
-
-    def reconcile_aggregates(
-        self,
-        table_conf: Table,
-        src_schema: list[Schema],
-        tgt_schema: list[Schema],
-    ) -> list[AggregateQueryOutput]:
-        return self._get_reconcile_aggregate_output(table_conf, src_schema, tgt_schema)
-
-    def _get_reconcile_output(
-        self,
-        table_conf,
-        src_schema,
-        tgt_schema,
-    ):
-        src_hash_query = HashQueryBuilder(table_conf, src_schema, "source", self._source_engine).build_query(
-            report_type=self._report_type
-        )
-        tgt_hash_query = HashQueryBuilder(table_conf, tgt_schema, "target", self._source_engine).build_query(
-            report_type=self._report_type
-        )
-        src_data = self._source.read_data(
-            catalog=self._database_config.source_catalog,
-            schema=self._database_config.source_schema,
-            table=table_conf.source_name,
-            query=src_hash_query,
-            options=table_conf.jdbc_reader_options,
-        )
-        tgt_data = self._target.read_data(
-            catalog=self._database_config.target_catalog,
-            schema=self._database_config.target_schema,
-            table=table_conf.target_name,
-            query=tgt_hash_query,
-            options=table_conf.jdbc_reader_options,
-        )
-
-        volume_path = generate_volume_path(table_conf, self._metadata_config)
-        return reconcile_data(
-            source=src_data,
-            target=tgt_data,
-            key_columns=table_conf.join_columns,
-            report_type=self._report_type,
-            spark=self._spark,
-            path=volume_path,
-        )
-
-    def _get_reconcile_aggregate_output(
-        self,
-        table_conf,
-        src_schema,
-        tgt_schema,
-    ):
-        """
-        Creates a single Query, for the aggregates having the same group by columns. (Ex: 1)
-        If there are no group by columns, all the aggregates are clubbed together in a single query. (Ex: 2)
-        Examples:
-            1. {
-                   "type": "MIN",
-                   "agg_cols": ["COL1"],
-                   "group_by_cols": ["COL4"]
-               },
-               {
-                   "type": "MAX",
-                   "agg_cols": ["COL2"],
-                   "group_by_cols": ["COL9"]
-               },
-               {
-                   "type": "COUNT",
-                   "agg_cols": ["COL2"],
-                   "group_by_cols": ["COL9"]
-               },
-               {
-                   "type": "AVG",
-                   "agg_cols": ["COL3"],
-                   "group_by_cols": ["COL4"]
-               },
-            Query 1: SELECT MIN(COL1), AVG(COL3) FROM :table GROUP BY COL4
-            Rules: ID | Aggregate Type | Column | Group By Column
-                   #1, MIN, COL1, COL4
-                   #2, AVG, COL3, COL4
-            -------------------------------------------------------
-            Query 2: SELECT MAX(COL2), COUNT(COL2) FROM :table GROUP BY COL9
-            Rules: ID | Aggregate Type | Column | Group By Column
-                   #1, MAX, COL2, COL9
-                   #2, COUNT, COL2, COL9
-            2. {
-                   "type": "MAX",
-                   "agg_cols": ["COL1"]
-               },
-               {
-                   "type": "SUM",
-                   "agg_cols": ["COL2"]
-               },
-               {
-                   "type": "MAX",
-                   "agg_cols": ["COL3"]
-               }
-            Query: SELECT MAX(COL1), SUM(COL2), MAX(COL3) FROM :table
-            Rules: ID | Aggregate Type | Column | Group By Column
-                   #1, MAX, COL1,
-                   #2, SUM, COL2,
-                   #3, MAX, COL3,
-        """
-
-        src_query_builder = AggregateQueryBuilder(
-            table_conf,
-            src_schema,
-            "source",
-            self._source_engine,
-        )
-
-        # build Aggregate queries for source,
-        src_agg_queries: list[AggregateQueryRules] = src_query_builder.build_queries()
-
-        # There could be one or more queries per table based on the group by columns
-
-        # build Aggregate queries for target(Databricks),
-        tgt_agg_queries: list[AggregateQueryRules] = AggregateQueryBuilder(
-            table_conf,
-            tgt_schema,
-            "target",
-            self._target_engine,
-        ).build_queries()
-
-        volume_path = generate_volume_path(table_conf, self._metadata_config)
-
-        table_agg_output: list[AggregateQueryOutput] = []
-
-        # Iterate over the grouped aggregates and reconcile the data
-        # Zip all the keys, read the source, target data for each Aggregate query
-        # and reconcile on the aggregate data
-        # For e.g., (source_query_GRP1, target_query_GRP1), (source_query_GRP2, target_query_GRP2)
-        for src_query_with_rules, tgt_query_with_rules in zip(src_agg_queries, tgt_agg_queries):
-            # For each Aggregate query, read the Source and Target Data and add a hash column
-
-            rules_reconcile_output: list[AggregateQueryOutput] = []
-            src_data = None
-            tgt_data = None
-            joined_df = None
-            data_source_exception = None
-            try:
-                src_data = self._source.read_data(
-                    catalog=self._database_config.source_catalog,
-                    schema=self._database_config.source_schema,
-                    table=table_conf.source_name,
-                    query=src_query_with_rules.query,
-                    options=table_conf.jdbc_reader_options,
-                )
-                tgt_data = self._target.read_data(
-                    catalog=self._database_config.target_catalog,
-                    schema=self._database_config.target_schema,
-                    table=table_conf.target_name,
-                    query=tgt_query_with_rules.query,
-                    options=table_conf.jdbc_reader_options,
-                )
-                # Join the Source and Target Aggregated data
-                joined_df = join_aggregate_data(
-                    source=src_data,
-                    target=tgt_data,
-                    key_columns=src_query_with_rules.group_by_columns,
-                    spark=self._spark,
-                    path=f"{volume_path}{src_query_with_rules.group_by_columns_as_str}",
-                )
-            except DataSourceRuntimeException as e:
-                data_source_exception = e
-
-            # For each Aggregated Query, reconcile the data based on the rule
-            for rule in src_query_with_rules.rules:
-                if data_source_exception:
-                    rule_reconcile_output = DataReconcileOutput(exception=str(data_source_exception))
-                else:
-                    rule_reconcile_output = reconcile_agg_data_per_rule(
-                        joined_df, src_data.columns, tgt_data.columns, rule
-                    )
-                rules_reconcile_output.append(AggregateQueryOutput(rule=rule, reconcile_output=rule_reconcile_output))
-
-            # For each table, there could be many Aggregated queries.
-            # Collect the list of Rule Reconcile output per each Aggregate query and append it to the list
-            table_agg_output.extend(rules_reconcile_output)
-        return table_agg_output
-
-    def _get_sample_data(
-        self,
-        table_conf,
-        reconcile_output,
-        src_schema,
-        tgt_schema,
-    ):
-        mismatch = None
-        missing_in_src = None
-        missing_in_tgt = None
-
-        if (
-            reconcile_output.mismatch_count > 0
-            or reconcile_output.missing_in_src_count > 0
-            or reconcile_output.missing_in_tgt_count > 0
-        ):
-            src_sampler = SamplingQueryBuilder(table_conf, src_schema, "source", self._source_engine)
-            tgt_sampler = SamplingQueryBuilder(table_conf, tgt_schema, "target", self._target_engine)
-            if reconcile_output.mismatch_count > 0:
-                mismatch = self._get_mismatch_data(
-                    src_sampler,
-                    tgt_sampler,
-                    reconcile_output.mismatch_count,
-                    reconcile_output.mismatch.mismatch_df,
-                    table_conf.join_columns,
-                    table_conf.source_name,
-                    table_conf.target_name,
-                    table_conf.sampling_options,
-                )
-
-            if reconcile_output.missing_in_src_count > 0:
-                missing_in_src = _get_missing_data(
-                    self._target,
-                    tgt_sampler,
-                    reconcile_output.missing_in_src,
-                    self._database_config.target_catalog,
-                    self._database_config.target_schema,
-                    table_conf.target_name,
-                )
-
-            if reconcile_output.missing_in_tgt_count > 0:
-                missing_in_tgt = _get_missing_data(
-                    self._source,
-                    src_sampler,
-                    reconcile_output.missing_in_tgt,
-                    self._database_config.source_catalog,
-                    self._database_config.source_schema,
-                    table_conf.source_name,
-                )
-
-        return DataReconcileOutput(
-            mismatch=mismatch,
-            mismatch_count=reconcile_output.mismatch_count,
-            missing_in_src_count=reconcile_output.missing_in_src_count,
-            missing_in_tgt_count=reconcile_output.missing_in_tgt_count,
-            missing_in_src=missing_in_src,
-            missing_in_tgt=missing_in_tgt,
-        )
-
-    def _get_mismatch_data(
-        self,
-        src_sampler,
-        tgt_sampler,
-        mismatch_count,
-        mismatch,
-        key_columns,
-        src_table: str,
-        tgt_table: str,
-        sampling_options: SamplingOptions,
-    ):
-
-        tgt_sampling_query = tgt_sampler.build_query_with_alias()
-
-        sampling_model_target = self._target.read_data(
-            catalog=self._database_config.target_catalog,
-            schema=self._database_config.target_schema,
-            table=tgt_table,
-            query=tgt_sampling_query,
-            options=None,
-        )
-
-        # Uses pre-calculated `mismatch_count` from `reconcile_output.mismatch_count` to avoid from recomputing `mismatch` for RandomSampler.
-        mismatch_sampler = SamplerFactory.get_sampler(sampling_options)
-        df = mismatch_sampler.sample(mismatch, mismatch_count, key_columns, sampling_model_target).cache()
-
-        src_mismatch_sample_query = src_sampler.build_query(df)
-        tgt_mismatch_sample_query = tgt_sampler.build_query(df)
-
-        src_data = self._source.read_data(
-            catalog=self._database_config.source_catalog,
-            schema=self._database_config.source_schema,
-            table=src_table,
-            query=src_mismatch_sample_query,
-            options=None,
-        )
-        tgt_data = self._target.read_data(
-            catalog=self._database_config.target_catalog,
-            schema=self._database_config.target_schema,
-            table=tgt_table,
-            query=tgt_mismatch_sample_query,
-            options=None,
-        )
-
-        return capture_mismatch_data_and_columns(source=src_data, target=tgt_data, key_columns=key_columns)
-
-    def _reconcile_threshold_data(
-        self,
-        table_conf: Table,
-        src_schema: list[Schema],
-        tgt_schema: list[Schema],
-    ):
-
-        src_data, tgt_data = self._get_threshold_data(table_conf, src_schema, tgt_schema)
-
-        source_view = f"source_{table_conf.source_name}_df_threshold_vw"
-        target_view = f"target_{table_conf.target_name}_df_threshold_vw"
-
-        src_data.createOrReplaceTempView(source_view)
-        tgt_data.createOrReplaceTempView(target_view)
-
-        return self._compute_threshold_comparison(table_conf, src_schema)
-
-    def _get_threshold_data(
-        self,
-        table_conf: Table,
-        src_schema: list[Schema],
-        tgt_schema: list[Schema],
-    ) -> tuple[DataFrame, DataFrame]:
-        src_threshold_query = ThresholdQueryBuilder(
-            table_conf, src_schema, "source", self._source_engine
-        ).build_threshold_query()
-        tgt_threshold_query = ThresholdQueryBuilder(
-            table_conf, tgt_schema, "target", self._target_engine
-        ).build_threshold_query()
-
-        src_data = self._source.read_data(
-            catalog=self._database_config.source_catalog,
-            schema=self._database_config.source_schema,
-            table=table_conf.source_name,
-            query=src_threshold_query,
-            options=table_conf.jdbc_reader_options,
-        )
-        tgt_data = self._target.read_data(
-            catalog=self._database_config.target_catalog,
-            schema=self._database_config.target_schema,
-            table=table_conf.target_name,
-            query=tgt_threshold_query,
-            options=table_conf.jdbc_reader_options,
-        )
-
-        return src_data, tgt_data
-
-    def _compute_threshold_comparison(self, table_conf: Table, src_schema: list[Schema]) -> ThresholdOutput:
-        threshold_comparison_query = ThresholdQueryBuilder(
-            table_conf, src_schema, "target", self._target_engine
-        ).build_comparison_query()
-
-        threshold_result = self._target.read_data(
-            catalog=self._database_config.target_catalog,
-            schema=self._database_config.target_schema,
-            table=table_conf.target_name,
-            query=threshold_comparison_query,
-            options=table_conf.jdbc_reader_options,
-        )
-        threshold_columns = table_conf.get_threshold_columns("source")
-        failed_where_cond = " OR ".join([name + "_match = 'Failed'" for name in threshold_columns])
-        mismatched_df = threshold_result.filter(failed_where_cond)
-        mismatched_count = mismatched_df.count()
-        threshold_df = None
-        if mismatched_count > 0:
-            threshold_df = mismatched_df.limit(_SAMPLE_ROWS)
-
-        return ThresholdOutput(threshold_df=threshold_df, threshold_mismatch_count=mismatched_count)
-
-    def get_record_count(self, table_conf: Table, report_type: str) -> ReconcileRecordCount:
-        if report_type != "schema":
-            source_count_query = CountQueryBuilder(table_conf, "source", self._source_engine).build_query()
-            target_count_query = CountQueryBuilder(table_conf, "target", self._target_engine).build_query()
-            source_count_row = self._source.read_data(
-                catalog=self._database_config.source_catalog,
-                schema=self._database_config.source_schema,
-                table=table_conf.source_name,
-                query=source_count_query,
-                options=None,
-            ).first()
-            target_count_row = self._target.read_data(
-                catalog=self._database_config.target_catalog,
-                schema=self._database_config.target_schema,
-                table=table_conf.target_name,
-                query=target_count_query,
-                options=None,
-            ).first()
-
-            source_count = int(source_count_row[0]) if source_count_row is not None else 0
-            target_count = int(target_count_row[0]) if target_count_row is not None else 0
-
-            return ReconcileRecordCount(source=int(source_count), target=int(target_count))
-        return ReconcileRecordCount()
-
-
-def _get_schema(
-    source: DataSource,
-    target: DataSource,
-    table_conf: Table,
-    database_config: DatabaseConfig,
-) -> tuple[list[Schema], list[Schema]]:
-    src_schema = source.get_schema(
-        catalog=database_config.source_catalog,
-        schema=database_config.source_schema,
-        table=table_conf.source_name,
-    )
-    tgt_schema = target.get_schema(
-        catalog=database_config.target_catalog,
-        schema=database_config.target_schema,
-        table=table_conf.target_name,
-    )
-
-    return src_schema, tgt_schema
-
-
-def _run_reconcile_data(
-    reconciler: Reconciliation,
-    table_conf: Table,
-    src_schema: list[Schema],
-    tgt_schema: list[Schema],
-) -> DataReconcileOutput:
-    try:
-        return reconciler.reconcile_data(table_conf=table_conf, src_schema=src_schema, tgt_schema=tgt_schema)
-    except DataSourceRuntimeException as e:
-        return DataReconcileOutput(exception=str(e))
-
-
-def _run_reconcile_schema(
-    reconciler: Reconciliation,
-    table_conf: Table,
-    src_schema: list[Schema],
-    tgt_schema: list[Schema],
-):
-    try:
-        return reconciler.reconcile_schema(table_conf=table_conf, src_schema=src_schema, tgt_schema=tgt_schema)
-    except PySparkException as e:
-        return SchemaReconcileOutput(is_valid=False, exception=str(e))
-
-
-def _run_reconcile_aggregates(
-    reconciler: Reconciliation,
-    table_conf: Table,
-    src_schema: list[Schema],
-    tgt_schema: list[Schema],
-) -> list[AggregateQueryOutput]:
-    try:
-        return reconciler.reconcile_aggregates(table_conf, src_schema, tgt_schema)
-    except DataSourceRuntimeException as e:
-        return [AggregateQueryOutput(reconcile_output=DataReconcileOutput(exception=str(e)), rule=None)]
-
-
 if __name__ == "__main__":
     if "DATABRICKS_RUNTIME_VERSION" not in os.environ:
         raise SystemExit("Only intended to run in Databricks Runtime")
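None of the removed behaviour is dropped outright: per the file list, the `Reconciliation` class and its helpers re-land in the new reconcile modules (`reconciliation.py`, `utils.py`, the trigger services), though exactly which helper landed where is not visible in this diff. For reference, the Unity Catalog volume layout used by the removed `generate_volume_path` helper, flattened here into plain string arguments:

```python
def generate_volume_path(catalog: str, schema: str, volume: str,
                         source_name: str, target_name: str) -> str:
    # Same f-string as the removed helper; the original took a Table and
    # a ReconcileMetadataConfig instead of plain strings.
    return f"/Volumes/{catalog}/{schema}/{volume}/{source_name}_{target_name}/"
```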