databricks-labs-lakebridge 0.10.5__py3-none-any.whl → 0.10.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- databricks/labs/lakebridge/__about__.py +1 -1
- databricks/labs/lakebridge/analyzer/__init__.py +0 -0
- databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py +95 -0
- databricks/labs/lakebridge/base_install.py +24 -3
- databricks/labs/lakebridge/cli.py +57 -72
- databricks/labs/lakebridge/config.py +1 -1
- databricks/labs/lakebridge/contexts/application.py +11 -4
- databricks/labs/lakebridge/deployment/dashboard.py +2 -1
- databricks/labs/lakebridge/deployment/installation.py +11 -11
- databricks/labs/lakebridge/deployment/job.py +2 -2
- databricks/labs/lakebridge/helpers/file_utils.py +36 -0
- databricks/labs/lakebridge/install.py +228 -278
- databricks/labs/lakebridge/reconcile/compare.py +70 -33
- databricks/labs/lakebridge/reconcile/connectors/data_source.py +19 -0
- databricks/labs/lakebridge/reconcile/connectors/databricks.py +11 -1
- databricks/labs/lakebridge/reconcile/connectors/dialect_utils.py +126 -0
- databricks/labs/lakebridge/reconcile/connectors/models.py +7 -0
- databricks/labs/lakebridge/reconcile/connectors/oracle.py +11 -1
- databricks/labs/lakebridge/reconcile/connectors/snowflake.py +14 -2
- databricks/labs/lakebridge/reconcile/connectors/tsql.py +27 -2
- databricks/labs/lakebridge/reconcile/constants.py +4 -3
- databricks/labs/lakebridge/reconcile/execute.py +9 -810
- databricks/labs/lakebridge/reconcile/normalize_recon_config_service.py +133 -0
- databricks/labs/lakebridge/reconcile/query_builder/base.py +3 -7
- databricks/labs/lakebridge/reconcile/recon_config.py +3 -0
- databricks/labs/lakebridge/reconcile/recon_output_config.py +2 -1
- databricks/labs/lakebridge/reconcile/reconciliation.py +508 -0
- databricks/labs/lakebridge/reconcile/schema_compare.py +26 -19
- databricks/labs/lakebridge/reconcile/trigger_recon_aggregate_service.py +98 -0
- databricks/labs/lakebridge/reconcile/trigger_recon_service.py +253 -0
- databricks/labs/lakebridge/reconcile/utils.py +38 -0
- databricks/labs/lakebridge/transpiler/lsp/lsp_engine.py +48 -63
- databricks/labs/lakebridge/transpiler/repository.py +123 -0
- databricks/labs/lakebridge/transpiler/sqlglot/dialect_utils.py +2 -0
- databricks/labs/lakebridge/transpiler/transpile_engine.py +0 -18
- {databricks_labs_lakebridge-0.10.5.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/METADATA +1 -1
- {databricks_labs_lakebridge-0.10.5.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/RECORD +41 -31
- {databricks_labs_lakebridge-0.10.5.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/WHEEL +0 -0
- {databricks_labs_lakebridge-0.10.5.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/entry_points.txt +0 -0
- {databricks_labs_lakebridge-0.10.5.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/licenses/LICENSE +0 -0
- {databricks_labs_lakebridge-0.10.5.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/licenses/NOTICE +0 -0
databricks/labs/lakebridge/reconcile/normalize_recon_config_service.py (new file)
@@ -0,0 +1,133 @@
+import dataclasses
+
+from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource
+from databricks.labs.lakebridge.reconcile.recon_config import (
+    Table,
+    Aggregate,
+    ColumnMapping,
+    Transformation,
+    ColumnThresholds,
+)
+
+
+class NormalizeReconConfigService:
+    def __init__(self, source: DataSource, target: DataSource):
+        self.source = source
+        self.target = target
+
+    def normalize_recon_table_config(self, table: Table) -> Table:
+        normalized_table = dataclasses.replace(table)
+
+        self._normalize_sampling(normalized_table)
+        self._normalize_aggs(normalized_table)
+        self._normalize_join_cols(normalized_table)
+        self._normalize_select_cols(normalized_table)
+        self._normalize_drop_cols(normalized_table)
+        self._normalize_col_mappings(normalized_table)
+        self._normalize_transformations(normalized_table)
+        self._normalize_col_thresholds(normalized_table)
+        self._normalize_jdbc_options(normalized_table)
+
+        return normalized_table
+
+    def _normalize_sampling(self, table: Table):
+        if table.sampling_options:
+            normalized_sampling = dataclasses.replace(table.sampling_options)
+            normalized_sampling.stratified_columns = (
+                [self.source.normalize_identifier(c).ansi_normalized for c in normalized_sampling.stratified_columns]
+                if normalized_sampling.stratified_columns
+                else None
+            )
+            table.sampling_options = normalized_sampling
+        return table
+
+    def _normalize_aggs(self, table: Table):
+        normalized = [self._normalize_agg(a) for a in table.aggregates] if table.aggregates else None
+        table.aggregates = normalized
+        return table
+
+    def _normalize_agg(self, agg: Aggregate) -> Aggregate:
+        normalized = dataclasses.replace(agg)
+        normalized.agg_columns = [self.source.normalize_identifier(c).ansi_normalized for c in normalized.agg_columns]
+        normalized.group_by_columns = (
+            [self.source.normalize_identifier(c).ansi_normalized for c in normalized.group_by_columns]
+            if normalized.group_by_columns
+            else None
+        )
+        return normalized
+
+    def _normalize_join_cols(self, table: Table):
+        table.join_columns = (
+            [self.source.normalize_identifier(c).ansi_normalized for c in table.join_columns]
+            if table.join_columns
+            else None
+        )
+        return table
+
+    def _normalize_select_cols(self, table: Table):
+        table.select_columns = (
+            [self.source.normalize_identifier(c).ansi_normalized for c in table.select_columns]
+            if table.select_columns
+            else None
+        )
+        return table
+
+    def _normalize_drop_cols(self, table: Table):
+        table.drop_columns = (
+            [self.source.normalize_identifier(c).ansi_normalized for c in table.drop_columns]
+            if table.drop_columns
+            else None
+        )
+        return table
+
+    def _normalize_col_mappings(self, table: Table):
+        table.column_mapping = (
+            [self._normalize_col_mapping(m) for m in table.column_mapping] if table.column_mapping else None
+        )
+        return table
+
+    def _normalize_col_mapping(self, mapping: ColumnMapping):
+        return ColumnMapping(
+            source_name=self.source.normalize_identifier(mapping.source_name).ansi_normalized,
+            target_name=self.target.normalize_identifier(mapping.target_name).ansi_normalized,
+        )
+
+    def _normalize_transformations(self, table: Table):
+        table.transformations = (
+            [self._normalize_transformation(t) for t in table.transformations] if table.transformations else None
+        )
+        return table
+
+    def _normalize_transformation(self, transform: Transformation):
+        """normalize user-configured transformations
+
+        The user configures the table column and passes SQL code to transform the source table and target table.
+        This is useful in scenarios when the data changes e.g. migrating `datetime`s. The SQL code is not normalized
+        and it is the user responsibility to pass valid SQL respecting source database and target database.
+        """
+        normalized = dataclasses.replace(transform)
+        normalized.column_name = self.source.normalize_identifier(transform.column_name).ansi_normalized
+        return normalized
+
+    def _normalize_col_thresholds(self, table: Table):
+        table.column_thresholds = (
+            [self._normalize_col_threshold(t) for t in table.column_thresholds] if table.column_thresholds else None
+        )
+        return table
+
+    def _normalize_col_threshold(self, threshold: ColumnThresholds):
+        normalized = dataclasses.replace(threshold)
+        normalized.column_name = self.source.normalize_identifier(threshold.column_name).ansi_normalized
+        return normalized
+
+    def _normalize_jdbc_options(self, table: Table):
+        if table.jdbc_reader_options:
+            normalized = dataclasses.replace(table.jdbc_reader_options)
+            normalized.partition_column = (
+                self.source.normalize_identifier(normalized.partition_column).ansi_normalized
+                if normalized.partition_column
+                else None
+            )
+            table.jdbc_reader_options = normalized
+
+        return table
databricks/labs/lakebridge/reconcile/query_builder/base.py
@@ -4,6 +4,7 @@ from abc import ABC
 import sqlglot.expressions as exp
 from sqlglot import Dialect, parse_one
 
+from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource
 from databricks.labs.lakebridge.reconcile.exception import InvalidInputException
 from databricks.labs.lakebridge.reconcile.query_builder.expression_generator import (
     DataType_transform_mapping,
@@ -16,17 +17,12 @@ logger = logging.getLogger(__name__)
 
 
 class QueryBuilder(ABC):
-    def __init__(
-        self,
-        table_conf: Table,
-        schema: list[Schema],
-        layer: str,
-        engine: Dialect,
-    ):
+    def __init__(self, table_conf: Table, schema: list[Schema], layer: str, engine: Dialect, data_source: DataSource):
         self._table_conf = table_conf
         self._schema = schema
         self._layer = layer
         self._engine = engine
+        self._data_source = data_source
 
     @property
     def engine(self) -> Dialect:
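The base `QueryBuilder` now also receives the `DataSource`, so concrete builders can consult the connector when normalizing identifiers. Callers pass the connector as the new fifth argument; a short sketch, mirroring the call pattern used by `reconciliation.py` later in this diff and assuming `table_conf`, `src_schema`, `source_engine` and `source` are already in scope:

```python
# Sketch only: the names above are assumed to be provided by the caller.
from databricks.labs.lakebridge.reconcile.query_builder.hash_query import HashQueryBuilder

src_hash_query = HashQueryBuilder(
    table_conf, src_schema, "source", source_engine, source
).build_query(report_type="data")
```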
databricks/labs/lakebridge/reconcile/recon_config.py
@@ -288,8 +288,11 @@ class Table:
 
 @dataclass
 class Schema:
+    # TODO remove: This will have the value of ansi_normalized_column_name. Kept for backwards compatibility.
     column_name: str
     data_type: str
+    ansi_normalized_column_name: str
+    source_normalized_column_name: str
 
 
 @dataclass
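`Schema` rows now carry both normalized spellings of a column name next to the legacy `column_name` field (which, per the TODO, mirrors the ANSI-normalized value). A sketch with hypothetical values for a case-sensitive source column; the exact normalization rules live in the per-dialect connectors, not here:

```python
from databricks.labs.lakebridge.reconcile.recon_config import Schema

# Hypothetical values for illustration only.
schema_row = Schema(
    column_name="order_id",                    # kept for backwards compatibility; same value as the ANSI form
    data_type="number",
    ansi_normalized_column_name="order_id",
    source_normalized_column_name="ORDER_ID",  # e.g. how the source dialect spells the identifier
)
```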
databricks/labs/lakebridge/reconcile/reconciliation.py (new file)
@@ -0,0 +1,508 @@
+import logging
+
+from pyspark.sql import DataFrame, SparkSession
+from sqlglot import Dialect
+
+from databricks.labs.lakebridge.config import (
+    DatabaseConfig,
+    ReconcileMetadataConfig,
+)
+from databricks.labs.lakebridge.reconcile import utils
+from databricks.labs.lakebridge.reconcile.compare import (
+    capture_mismatch_data_and_columns,
+    reconcile_data,
+    join_aggregate_data,
+    reconcile_agg_data_per_rule,
+)
+from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource
+from databricks.labs.lakebridge.reconcile.exception import (
+    DataSourceRuntimeException,
+)
+from databricks.labs.lakebridge.reconcile.query_builder.aggregate_query import AggregateQueryBuilder
+from databricks.labs.lakebridge.reconcile.query_builder.count_query import CountQueryBuilder
+from databricks.labs.lakebridge.reconcile.query_builder.hash_query import HashQueryBuilder
+from databricks.labs.lakebridge.reconcile.query_builder.sampling_query import (
+    SamplingQueryBuilder,
+)
+from databricks.labs.lakebridge.reconcile.query_builder.threshold_query import (
+    ThresholdQueryBuilder,
+)
+from databricks.labs.lakebridge.reconcile.recon_config import (
+    Schema,
+    Table,
+    AggregateQueryRules,
+    SamplingOptions,
+)
+from databricks.labs.lakebridge.reconcile.recon_output_config import (
+    DataReconcileOutput,
+    ThresholdOutput,
+    ReconcileRecordCount,
+    AggregateQueryOutput,
+)
+from databricks.labs.lakebridge.reconcile.sampler import SamplerFactory
+from databricks.labs.lakebridge.reconcile.schema_compare import SchemaCompare
+from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import get_dialect
+
+logger = logging.getLogger(__name__)
+_SAMPLE_ROWS = 50
+
+
+class Reconciliation:
+
+    def __init__(
+        self,
+        source: DataSource,
+        target: DataSource,
+        database_config: DatabaseConfig,
+        report_type: str,
+        schema_comparator: SchemaCompare,
+        source_engine: Dialect,
+        spark: SparkSession,
+        metadata_config: ReconcileMetadataConfig,
+    ):
+        self._source = source
+        self._target = target
+        self._report_type = report_type
+        self._database_config = database_config
+        self._schema_comparator = schema_comparator
+        self._target_engine = get_dialect("databricks")
+        self._source_engine = source_engine
+        self._spark = spark
+        self._metadata_config = metadata_config
+
+    @property
+    def source(self) -> DataSource:
+        return self._source
+
+    @property
+    def target(self) -> DataSource:
+        return self._target
+
+    @property
+    def report_type(self) -> str:
+        return self._report_type
+
+    def reconcile_data(
+        self,
+        table_conf: Table,
+        src_schema: list[Schema],
+        tgt_schema: list[Schema],
+    ) -> DataReconcileOutput:
+        data_reconcile_output = self._get_reconcile_output(table_conf, src_schema, tgt_schema)
+        reconcile_output = data_reconcile_output
+        if self._report_type in {"data", "all"}:
+            reconcile_output = self._get_sample_data(table_conf, data_reconcile_output, src_schema, tgt_schema)
+            if table_conf.get_threshold_columns("source"):
+                reconcile_output.threshold_output = self._reconcile_threshold_data(table_conf, src_schema, tgt_schema)
+
+        if self._report_type == "row" and table_conf.get_threshold_columns("source"):
+            logger.warning("Threshold comparison is ignored for 'row' report type")
+
+        return reconcile_output
+
+    def reconcile_schema(
+        self,
+        src_schema: list[Schema],
+        tgt_schema: list[Schema],
+        table_conf: Table,
+    ):
+        return self._schema_comparator.compare(src_schema, tgt_schema, self._source_engine, table_conf)
+
+    def reconcile_aggregates(
+        self,
+        table_conf: Table,
+        src_schema: list[Schema],
+        tgt_schema: list[Schema],
+    ) -> list[AggregateQueryOutput]:
+        return self._get_reconcile_aggregate_output(table_conf, src_schema, tgt_schema)
+
+    def _get_reconcile_output(
+        self,
+        table_conf,
+        src_schema,
+        tgt_schema,
+    ):
+        src_hash_query = HashQueryBuilder(
+            table_conf, src_schema, "source", self._source_engine, self._source
+        ).build_query(report_type=self._report_type)
+        tgt_hash_query = HashQueryBuilder(
+            table_conf, tgt_schema, "target", self._source_engine, self._target
+        ).build_query(report_type=self._report_type)
+        src_data = self._source.read_data(
+            catalog=self._database_config.source_catalog,
+            schema=self._database_config.source_schema,
+            table=table_conf.source_name,
+            query=src_hash_query,
+            options=table_conf.jdbc_reader_options,
+        )
+        tgt_data = self._target.read_data(
+            catalog=self._database_config.target_catalog,
+            schema=self._database_config.target_schema,
+            table=table_conf.target_name,
+            query=tgt_hash_query,
+            options=table_conf.jdbc_reader_options,
+        )
+
+        volume_path = utils.generate_volume_path(table_conf, self._metadata_config)
+        return reconcile_data(
+            source=src_data,
+            target=tgt_data,
+            key_columns=table_conf.join_columns,
+            report_type=self._report_type,
+            spark=self._spark,
+            path=volume_path,
+        )
+
+    def _get_reconcile_aggregate_output(
+        self,
+        table_conf,
+        src_schema,
+        tgt_schema,
+    ):
+        """
+        Creates a single Query, for the aggregates having the same group by columns. (Ex: 1)
+        If there are no group by columns, all the aggregates are clubbed together in a single query. (Ex: 2)
+        Examples:
+        1. {
+               "type": "MIN",
+               "agg_cols": ["COL1"],
+               "group_by_cols": ["COL4"]
+           },
+           {
+               "type": "MAX",
+               "agg_cols": ["COL2"],
+               "group_by_cols": ["COL9"]
+           },
+           {
+               "type": "COUNT",
+               "agg_cols": ["COL2"],
+               "group_by_cols": ["COL9"]
+           },
+           {
+               "type": "AVG",
+               "agg_cols": ["COL3"],
+               "group_by_cols": ["COL4"]
+           },
+           Query 1: SELECT MIN(COL1), AVG(COL3) FROM :table GROUP BY COL4
+           Rules: ID | Aggregate Type | Column | Group By Column
+                  #1, MIN, COL1, COL4
+                  #2, AVG, COL3, COL4
+           -------------------------------------------------------
+           Query 2: SELECT MAX(COL2), COUNT(COL2) FROM :table GROUP BY COL9
+           Rules: ID | Aggregate Type | Column | Group By Column
+                  #1, MAX, COL2, COL9
+                  #2, COUNT, COL2, COL9
+        2. {
+               "type": "MAX",
+               "agg_cols": ["COL1"]
+           },
+           {
+               "type": "SUM",
+               "agg_cols": ["COL2"]
+           },
+           {
+               "type": "MAX",
+               "agg_cols": ["COL3"]
+           }
+           Query: SELECT MAX(COL1), SUM(COL2), MAX(COL3) FROM :table
+           Rules: ID | Aggregate Type | Column | Group By Column
+                  #1, MAX, COL1,
+                  #2, SUM, COL2,
+                  #3, MAX, COL3,
+        """
+
+        src_query_builder = AggregateQueryBuilder(
+            table_conf,
+            src_schema,
+            "source",
+            self._source_engine,
+            self._source,
+        )
+
+        # build Aggregate queries for source,
+        src_agg_queries: list[AggregateQueryRules] = src_query_builder.build_queries()
+
+        # There could be one or more queries per table based on the group by columns
+
+        # build Aggregate queries for target(Databricks),
+        tgt_agg_queries: list[AggregateQueryRules] = AggregateQueryBuilder(
+            table_conf,
+            tgt_schema,
+            "target",
+            self._target_engine,
+            self._target,
+        ).build_queries()
+
+        volume_path = utils.generate_volume_path(table_conf, self._metadata_config)
+
+        table_agg_output: list[AggregateQueryOutput] = []
+
+        # Iterate over the grouped aggregates and reconcile the data
+        # Zip all the keys, read the source, target data for each Aggregate query
+        # and reconcile on the aggregate data
+        # For e.g., (source_query_GRP1, target_query_GRP1), (source_query_GRP2, target_query_GRP2)
+        for src_query_with_rules, tgt_query_with_rules in zip(src_agg_queries, tgt_agg_queries):
+            # For each Aggregate query, read the Source and Target Data and add a hash column
+
+            rules_reconcile_output: list[AggregateQueryOutput] = []
+            src_data = None
+            tgt_data = None
+            joined_df = None
+            data_source_exception = None
+            try:
+                src_data = self._source.read_data(
+                    catalog=self._database_config.source_catalog,
+                    schema=self._database_config.source_schema,
+                    table=table_conf.source_name,
+                    query=src_query_with_rules.query,
+                    options=table_conf.jdbc_reader_options,
+                )
+                tgt_data = self._target.read_data(
+                    catalog=self._database_config.target_catalog,
+                    schema=self._database_config.target_schema,
+                    table=table_conf.target_name,
+                    query=tgt_query_with_rules.query,
+                    options=table_conf.jdbc_reader_options,
+                )
+                # Join the Source and Target Aggregated data
+                joined_df = join_aggregate_data(
+                    source=src_data,
+                    target=tgt_data,
+                    key_columns=src_query_with_rules.group_by_columns,
+                    spark=self._spark,
+                    path=f"{volume_path}{src_query_with_rules.group_by_columns_as_str}",
+                )
+            except DataSourceRuntimeException as e:
+                data_source_exception = e
+
+            # For each Aggregated Query, reconcile the data based on the rule
+            for rule in src_query_with_rules.rules:
+                if data_source_exception:
+                    rule_reconcile_output = DataReconcileOutput(exception=str(data_source_exception))
+                else:
+                    rule_reconcile_output = reconcile_agg_data_per_rule(
+                        joined_df, src_data.columns, tgt_data.columns, rule
+                    )
+                rules_reconcile_output.append(AggregateQueryOutput(rule=rule, reconcile_output=rule_reconcile_output))
+
+            # For each table, there could be many Aggregated queries.
+            # Collect the list of Rule Reconcile output per each Aggregate query and append it to the list
+            table_agg_output.extend(rules_reconcile_output)
+        return table_agg_output
+
+    def _get_sample_data(
+        self,
+        table_conf,
+        reconcile_output,
+        src_schema,
+        tgt_schema,
+    ):
+        mismatch = None
+        missing_in_src = None
+        missing_in_tgt = None
+
+        if (
+            reconcile_output.mismatch_count > 0
+            or reconcile_output.missing_in_src_count > 0
+            or reconcile_output.missing_in_tgt_count > 0
+        ):
+            src_sampler = SamplingQueryBuilder(table_conf, src_schema, "source", self._source_engine, self._source)
+            tgt_sampler = SamplingQueryBuilder(table_conf, tgt_schema, "target", self._target_engine, self._target)
+            if reconcile_output.mismatch_count > 0:
+                mismatch = self._get_mismatch_data(
+                    src_sampler,
+                    tgt_sampler,
+                    reconcile_output.mismatch_count,
+                    reconcile_output.mismatch.mismatch_df,
+                    table_conf.join_columns,
+                    table_conf.source_name,
+                    table_conf.target_name,
+                    table_conf.sampling_options,
+                )
+
+            if reconcile_output.missing_in_src_count > 0:
+                missing_in_src = Reconciliation._get_missing_data(
+                    self._target,
+                    tgt_sampler,
+                    reconcile_output.missing_in_src,
+                    self._database_config.target_catalog,
+                    self._database_config.target_schema,
+                    table_conf.target_name,
+                )
+
+            if reconcile_output.missing_in_tgt_count > 0:
+                missing_in_tgt = Reconciliation._get_missing_data(
+                    self._source,
+                    src_sampler,
+                    reconcile_output.missing_in_tgt,
+                    self._database_config.source_catalog,
+                    self._database_config.source_schema,
+                    table_conf.source_name,
+                )
+
+        return DataReconcileOutput(
+            mismatch=mismatch,
+            mismatch_count=reconcile_output.mismatch_count,
+            missing_in_src_count=reconcile_output.missing_in_src_count,
+            missing_in_tgt_count=reconcile_output.missing_in_tgt_count,
+            missing_in_src=missing_in_src,
+            missing_in_tgt=missing_in_tgt,
+        )
+
+    def _get_mismatch_data(
+        self,
+        src_sampler,
+        tgt_sampler,
+        mismatch_count,
+        mismatch,
+        key_columns,
+        src_table: str,
+        tgt_table: str,
+        sampling_options: SamplingOptions,
+    ):
+
+        tgt_sampling_query = tgt_sampler.build_query_with_alias()
+
+        sampling_model_target = self._target.read_data(
+            catalog=self._database_config.target_catalog,
+            schema=self._database_config.target_schema,
+            table=tgt_table,
+            query=tgt_sampling_query,
+            options=None,
+        )
+
+        # Uses pre-calculated `mismatch_count` from `reconcile_output.mismatch_count` to avoid from recomputing `mismatch` for RandomSampler.
+        mismatch_sampler = SamplerFactory.get_sampler(sampling_options)
+        df = mismatch_sampler.sample(mismatch, mismatch_count, key_columns, sampling_model_target).cache()
+
+        src_mismatch_sample_query = src_sampler.build_query(df)
+        tgt_mismatch_sample_query = tgt_sampler.build_query(df)
+
+        src_data = self._source.read_data(
+            catalog=self._database_config.source_catalog,
+            schema=self._database_config.source_schema,
+            table=src_table,
+            query=src_mismatch_sample_query,
+            options=None,
+        )
+        tgt_data = self._target.read_data(
+            catalog=self._database_config.target_catalog,
+            schema=self._database_config.target_schema,
+            table=tgt_table,
+            query=tgt_mismatch_sample_query,
+            options=None,
+        )
+
+        return capture_mismatch_data_and_columns(source=src_data, target=tgt_data, key_columns=key_columns)
+
+    def _reconcile_threshold_data(
+        self,
+        table_conf: Table,
+        src_schema: list[Schema],
+        tgt_schema: list[Schema],
+    ):
+
+        src_data, tgt_data = self._get_threshold_data(table_conf, src_schema, tgt_schema)
+
+        source_view = f"source_{table_conf.source_name}_df_threshold_vw"
+        target_view = f"target_{table_conf.target_name}_df_threshold_vw"
+
+        src_data.createOrReplaceTempView(source_view)
+        tgt_data.createOrReplaceTempView(target_view)
+
+        return self._compute_threshold_comparison(table_conf, src_schema)
+
+    def _get_threshold_data(
+        self,
+        table_conf: Table,
+        src_schema: list[Schema],
+        tgt_schema: list[Schema],
+    ) -> tuple[DataFrame, DataFrame]:
+        src_threshold_query = ThresholdQueryBuilder(
+            table_conf, src_schema, "source", self._source_engine, self._source
+        ).build_threshold_query()
+        tgt_threshold_query = ThresholdQueryBuilder(
+            table_conf, tgt_schema, "target", self._target_engine, self._target
+        ).build_threshold_query()
+
+        src_data = self._source.read_data(
+            catalog=self._database_config.source_catalog,
+            schema=self._database_config.source_schema,
+            table=table_conf.source_name,
+            query=src_threshold_query,
+            options=table_conf.jdbc_reader_options,
+        )
+        tgt_data = self._target.read_data(
+            catalog=self._database_config.target_catalog,
+            schema=self._database_config.target_schema,
+            table=table_conf.target_name,
+            query=tgt_threshold_query,
+            options=table_conf.jdbc_reader_options,
+        )
+
+        return src_data, tgt_data
+
+    def _compute_threshold_comparison(self, table_conf: Table, src_schema: list[Schema]) -> ThresholdOutput:
+        threshold_comparison_query = ThresholdQueryBuilder(
+            table_conf, src_schema, "target", self._target_engine, self._target
+        ).build_comparison_query()
+
+        threshold_result = self._target.read_data(
+            catalog=self._database_config.target_catalog,
+            schema=self._database_config.target_schema,
+            table=table_conf.target_name,
+            query=threshold_comparison_query,
+            options=table_conf.jdbc_reader_options,
+        )
+        threshold_columns = table_conf.get_threshold_columns("source")
+        failed_where_cond = " OR ".join([name + "_match = 'Failed'" for name in threshold_columns])
+        mismatched_df = threshold_result.filter(failed_where_cond)
+        mismatched_count = mismatched_df.count()
+        threshold_df = None
+        if mismatched_count > 0:
+            threshold_df = mismatched_df.limit(_SAMPLE_ROWS)
+
+        return ThresholdOutput(threshold_df=threshold_df, threshold_mismatch_count=mismatched_count)
+
+    def get_record_count(self, table_conf: Table, report_type: str) -> ReconcileRecordCount:
+        if report_type != "schema":
+            source_count_query = CountQueryBuilder(table_conf, "source", self._source_engine).build_query()
+            target_count_query = CountQueryBuilder(table_conf, "target", self._target_engine).build_query()
+            source_count_row = self._source.read_data(
+                catalog=self._database_config.source_catalog,
+                schema=self._database_config.source_schema,
+                table=table_conf.source_name,
+                query=source_count_query,
+                options=None,
+            ).first()
+            target_count_row = self._target.read_data(
+                catalog=self._database_config.target_catalog,
+                schema=self._database_config.target_schema,
+                table=table_conf.target_name,
+                query=target_count_query,
+                options=None,
+            ).first()
+
+            source_count = int(source_count_row[0]) if source_count_row is not None else 0
+            target_count = int(target_count_row[0]) if target_count_row is not None else 0
+
+            return ReconcileRecordCount(source=int(source_count), target=int(target_count))
+        return ReconcileRecordCount()
+
+    @staticmethod
+    def _get_missing_data(
+        reader: DataSource,
+        sampler: SamplingQueryBuilder,
+        missing_df: DataFrame,
+        catalog: str,
+        schema: str,
+        table_name: str,
+    ) -> DataFrame:
+        sample_query = sampler.build_query(missing_df)
+        return reader.read_data(
+            catalog=catalog,
+            schema=schema,
+            table=table_name,
+            query=sample_query,
+            options=None,
+        )