databricks_labs_lakebridge-0.10.0-py3-none-any.whl
This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- databricks/__init__.py +3 -0
- databricks/labs/__init__.py +3 -0
- databricks/labs/lakebridge/__about__.py +2 -0
- databricks/labs/lakebridge/__init__.py +11 -0
- databricks/labs/lakebridge/assessments/configure_assessment.py +194 -0
- databricks/labs/lakebridge/assessments/pipeline.py +188 -0
- databricks/labs/lakebridge/assessments/profiler_config.py +30 -0
- databricks/labs/lakebridge/base_install.py +12 -0
- databricks/labs/lakebridge/cli.py +449 -0
- databricks/labs/lakebridge/config.py +192 -0
- databricks/labs/lakebridge/connections/__init__.py +0 -0
- databricks/labs/lakebridge/connections/credential_manager.py +89 -0
- databricks/labs/lakebridge/connections/database_manager.py +98 -0
- databricks/labs/lakebridge/connections/env_getter.py +13 -0
- databricks/labs/lakebridge/contexts/__init__.py +0 -0
- databricks/labs/lakebridge/contexts/application.py +133 -0
- databricks/labs/lakebridge/coverage/__init__.py +0 -0
- databricks/labs/lakebridge/coverage/commons.py +223 -0
- databricks/labs/lakebridge/coverage/lakebridge_snow_transpilation_coverage.py +29 -0
- databricks/labs/lakebridge/coverage/local_report.py +9 -0
- databricks/labs/lakebridge/coverage/sqlglot_snow_transpilation_coverage.py +5 -0
- databricks/labs/lakebridge/coverage/sqlglot_tsql_transpilation_coverage.py +5 -0
- databricks/labs/lakebridge/deployment/__init__.py +0 -0
- databricks/labs/lakebridge/deployment/configurator.py +199 -0
- databricks/labs/lakebridge/deployment/dashboard.py +140 -0
- databricks/labs/lakebridge/deployment/installation.py +125 -0
- databricks/labs/lakebridge/deployment/job.py +147 -0
- databricks/labs/lakebridge/deployment/recon.py +145 -0
- databricks/labs/lakebridge/deployment/table.py +30 -0
- databricks/labs/lakebridge/deployment/upgrade_common.py +124 -0
- databricks/labs/lakebridge/discovery/table.py +36 -0
- databricks/labs/lakebridge/discovery/table_definition.py +23 -0
- databricks/labs/lakebridge/discovery/tsql_table_definition.py +185 -0
- databricks/labs/lakebridge/errors/exceptions.py +1 -0
- databricks/labs/lakebridge/helpers/__init__.py +0 -0
- databricks/labs/lakebridge/helpers/db_sql.py +24 -0
- databricks/labs/lakebridge/helpers/execution_time.py +20 -0
- databricks/labs/lakebridge/helpers/file_utils.py +64 -0
- databricks/labs/lakebridge/helpers/metastore.py +164 -0
- databricks/labs/lakebridge/helpers/recon_config_utils.py +176 -0
- databricks/labs/lakebridge/helpers/string_utils.py +62 -0
- databricks/labs/lakebridge/helpers/telemetry_utils.py +13 -0
- databricks/labs/lakebridge/helpers/validation.py +101 -0
- databricks/labs/lakebridge/install.py +849 -0
- databricks/labs/lakebridge/intermediate/__init__.py +0 -0
- databricks/labs/lakebridge/intermediate/dag.py +88 -0
- databricks/labs/lakebridge/intermediate/engine_adapter.py +0 -0
- databricks/labs/lakebridge/intermediate/root_tables.py +44 -0
- databricks/labs/lakebridge/jvmproxy.py +56 -0
- databricks/labs/lakebridge/lineage.py +42 -0
- databricks/labs/lakebridge/reconcile/__init__.py +0 -0
- databricks/labs/lakebridge/reconcile/compare.py +414 -0
- databricks/labs/lakebridge/reconcile/connectors/__init__.py +0 -0
- databricks/labs/lakebridge/reconcile/connectors/data_source.py +72 -0
- databricks/labs/lakebridge/reconcile/connectors/databricks.py +87 -0
- databricks/labs/lakebridge/reconcile/connectors/jdbc_reader.py +41 -0
- databricks/labs/lakebridge/reconcile/connectors/oracle.py +108 -0
- databricks/labs/lakebridge/reconcile/connectors/secrets.py +30 -0
- databricks/labs/lakebridge/reconcile/connectors/snowflake.py +173 -0
- databricks/labs/lakebridge/reconcile/connectors/source_adapter.py +30 -0
- databricks/labs/lakebridge/reconcile/connectors/sql_server.py +132 -0
- databricks/labs/lakebridge/reconcile/constants.py +37 -0
- databricks/labs/lakebridge/reconcile/exception.py +42 -0
- databricks/labs/lakebridge/reconcile/execute.py +920 -0
- databricks/labs/lakebridge/reconcile/query_builder/__init__.py +0 -0
- databricks/labs/lakebridge/reconcile/query_builder/aggregate_query.py +293 -0
- databricks/labs/lakebridge/reconcile/query_builder/base.py +138 -0
- databricks/labs/lakebridge/reconcile/query_builder/count_query.py +33 -0
- databricks/labs/lakebridge/reconcile/query_builder/expression_generator.py +292 -0
- databricks/labs/lakebridge/reconcile/query_builder/hash_query.py +91 -0
- databricks/labs/lakebridge/reconcile/query_builder/sampling_query.py +123 -0
- databricks/labs/lakebridge/reconcile/query_builder/threshold_query.py +231 -0
- databricks/labs/lakebridge/reconcile/recon_capture.py +635 -0
- databricks/labs/lakebridge/reconcile/recon_config.py +363 -0
- databricks/labs/lakebridge/reconcile/recon_output_config.py +85 -0
- databricks/labs/lakebridge/reconcile/runner.py +97 -0
- databricks/labs/lakebridge/reconcile/sampler.py +239 -0
- databricks/labs/lakebridge/reconcile/schema_compare.py +126 -0
- databricks/labs/lakebridge/resources/__init__.py +0 -0
- databricks/labs/lakebridge/resources/config/credentials.yml +33 -0
- databricks/labs/lakebridge/resources/reconcile/__init__.py +0 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/__init__.py +0 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/00_0_aggregate_recon_header.md +6 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_0_recon_id.filter.yml +6 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_1_executed_by.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_2_started_at.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_0_source_type.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_1_source_table.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_2_target_table.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/04_0_aggregate_summary_table.sql +46 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/05_0_aggregate_recon_drilldown_header.md +2 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_0_recon_id.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_1_category.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_2_aggregate_type.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/07_0_target_table.filter.yml +4 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/07_1_source_table.filter.yml +4 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/08_0_aggregate_details_table.sql +92 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/09_0_aggregate_missing_mismatch_header.md +1 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/10_0_aggr_mismatched_records.sql +19 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/11_0_aggr_missing_in_databricks.sql +19 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/11_1_aggr_missing_in_source.sql +19 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/dashboard.yml +365 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/00_0_recon_main.md +3 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/01_0_recon_id.filter.yml +6 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/01_1_report_type.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/01_2_executed_by.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/02_0_source_type.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/02_1_source_table.filter.yml +6 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/02_2_target_table.filter.yml +6 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/03_0_started_at.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/05_0_summary_table.sql +38 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/06_0_schema_comparison_header.md +3 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/07_0_schema_details_table.sql +42 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/08_0_drill_down_header.md +3 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/09_0_recon_id.filter.yml +4 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/09_1_category.filter.yml +4 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/10_0_target_table.filter.yml +4 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/10_1_source_table.filter.yml +4 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/11_0_recon_details_pivot.sql +40 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/12_0_daily_data_validation_issue_header.md +3 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/13_0_success_fail_.filter.yml +4 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/14_0_failed_recon_ids.sql +15 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/15_0_total_failed_runs.sql +10 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/15_1_failed_targets.sql +10 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/15_2_successful_targets.sql +10 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/16_0_missing_mismatch_header.md +1 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/17_0_mismatched_records.sql +14 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/17_1_threshold_mismatches.sql +14 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/18_0_missing_in_databricks.sql +14 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/18_1_missing_in_source.sql +14 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/dashboard.yml +545 -0
- databricks/labs/lakebridge/resources/reconcile/queries/__init__.py +0 -0
- databricks/labs/lakebridge/resources/reconcile/queries/installation/__init__.py +0 -0
- databricks/labs/lakebridge/resources/reconcile/queries/installation/aggregate_details.sql +7 -0
- databricks/labs/lakebridge/resources/reconcile/queries/installation/aggregate_metrics.sql +15 -0
- databricks/labs/lakebridge/resources/reconcile/queries/installation/aggregate_rules.sql +6 -0
- databricks/labs/lakebridge/resources/reconcile/queries/installation/details.sql +7 -0
- databricks/labs/lakebridge/resources/reconcile/queries/installation/main.sql +24 -0
- databricks/labs/lakebridge/resources/reconcile/queries/installation/metrics.sql +21 -0
- databricks/labs/lakebridge/transpiler/__init__.py +0 -0
- databricks/labs/lakebridge/transpiler/execute.py +423 -0
- databricks/labs/lakebridge/transpiler/lsp/__init__.py +0 -0
- databricks/labs/lakebridge/transpiler/lsp/lsp_engine.py +564 -0
- databricks/labs/lakebridge/transpiler/sqlglot/__init__.py +0 -0
- databricks/labs/lakebridge/transpiler/sqlglot/dialect_utils.py +30 -0
- databricks/labs/lakebridge/transpiler/sqlglot/generator/__init__.py +0 -0
- databricks/labs/lakebridge/transpiler/sqlglot/generator/databricks.py +771 -0
- databricks/labs/lakebridge/transpiler/sqlglot/lca_utils.py +138 -0
- databricks/labs/lakebridge/transpiler/sqlglot/local_expression.py +197 -0
- databricks/labs/lakebridge/transpiler/sqlglot/parsers/__init__.py +0 -0
- databricks/labs/lakebridge/transpiler/sqlglot/parsers/oracle.py +23 -0
- databricks/labs/lakebridge/transpiler/sqlglot/parsers/presto.py +202 -0
- databricks/labs/lakebridge/transpiler/sqlglot/parsers/snowflake.py +535 -0
- databricks/labs/lakebridge/transpiler/sqlglot/sqlglot_engine.py +203 -0
- databricks/labs/lakebridge/transpiler/transpile_engine.py +49 -0
- databricks/labs/lakebridge/transpiler/transpile_status.py +68 -0
- databricks/labs/lakebridge/uninstall.py +28 -0
- databricks/labs/lakebridge/upgrades/v0.4.0_add_main_table_operation_name_column.py +80 -0
- databricks/labs/lakebridge/upgrades/v0.6.0_alter_metrics_datatype.py +51 -0
- databricks_labs_lakebridge-0.10.0.dist-info/METADATA +58 -0
- databricks_labs_lakebridge-0.10.0.dist-info/RECORD +171 -0
- databricks_labs_lakebridge-0.10.0.dist-info/WHEEL +4 -0
- databricks_labs_lakebridge-0.10.0.dist-info/entry_points.txt +2 -0
- databricks_labs_lakebridge-0.10.0.dist-info/licenses/LICENSE +69 -0
- databricks_labs_lakebridge-0.10.0.dist-info/licenses/NOTICE +42 -0
- docs/lakebridge/src/components/Button.tsx +81 -0
- docs/lakebridge/src/css/custom.css +167 -0
- docs/lakebridge/src/css/table.css +20 -0
- docs/lakebridge/src/pages/index.tsx +57 -0
- docs/lakebridge/src/theme/Footer/index.tsx +24 -0
- docs/lakebridge/src/theme/Layout/index.tsx +18 -0
@@ -0,0 +1,635 @@
+import logging
+from datetime import datetime
+from functools import reduce
+
+from pyspark.sql import DataFrame, SparkSession
+from pyspark.sql.functions import col, collect_list, create_map, lit
+from pyspark.sql.types import StringType, StructField, StructType
+from pyspark.errors import PySparkException
+from sqlglot import Dialect
+
+from databricks.labs.lakebridge.config import DatabaseConfig, Table, ReconcileMetadataConfig
+from databricks.labs.lakebridge.reconcile.recon_config import TableThresholds
+from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import get_key_from_dialect
+from databricks.labs.lakebridge.reconcile.exception import (
+    WriteToTableException,
+    ReadAndWriteWithVolumeException,
+    CleanFromVolumeException,
+)
+from databricks.labs.lakebridge.reconcile.recon_output_config import (
+    DataReconcileOutput,
+    ReconcileOutput,
+    ReconcileProcessDuration,
+    ReconcileTableOutput,
+    SchemaReconcileOutput,
+    StatusOutput,
+    ReconcileRecordCount,
+    AggregateQueryOutput,
+)
+from databricks.sdk import WorkspaceClient
+
+logger = logging.getLogger(__name__)
+
+_RECON_TABLE_NAME = "main"
+_RECON_METRICS_TABLE_NAME = "metrics"
+_RECON_DETAILS_TABLE_NAME = "details"
+_RECON_AGGREGATE_RULES_TABLE_NAME = "aggregate_rules"
+_RECON_AGGREGATE_METRICS_TABLE_NAME = "aggregate_metrics"
+_RECON_AGGREGATE_DETAILS_TABLE_NAME = "aggregate_details"
+
+
+class ReconIntermediatePersist:
+
+    def __init__(self, spark: SparkSession, path: str):
+        self.spark = spark
+        self.path = path
+
+    def _write_unmatched_df_to_volumes(
+        self,
+        unmatched_df: DataFrame,
+    ) -> None:
+        unmatched_df.write.format("parquet").mode("overwrite").save(self.path)
+
+    def _read_unmatched_df_from_volumes(self) -> DataFrame:
+        return self.spark.read.format("parquet").load(self.path)
+
+    def clean_unmatched_df_from_volume(self):
+        try:
+            # TODO: for now we are overwriting the intermediate cache path. We should delete the volume in future
+            # workspace_client.dbfs.get_status(path)
+            # workspace_client.dbfs.delete(path, recursive=True)
+            empty_df = self.spark.createDataFrame([], schema=StructType([StructField("empty", StringType(), True)]))
+            empty_df.write.format("parquet").mode("overwrite").save(self.path)
+            logger.warning(f"Unmatched DF cleaned up from {self.path} successfully.")
+        except PySparkException as e:
+            message = f"Error cleaning up unmatched DF from {self.path} volumes --> {e}"
+            logger.error(message)
+            raise CleanFromVolumeException(message) from e
+
+    def write_and_read_unmatched_df_with_volumes(
+        self,
+        unmatched_df: DataFrame,
+    ) -> DataFrame:
+        try:
+            self._write_unmatched_df_to_volumes(unmatched_df)
+            return self._read_unmatched_df_from_volumes()
+        except PySparkException as e:
+            message = f"Exception in reading or writing unmatched DF with volumes {self.path} --> {e}"
+            logger.error(message)
+            raise ReadAndWriteWithVolumeException(message) from e
+
+
+def _write_df_to_delta(df: DataFrame, table_name: str, mode="append"):
+    try:
+        df.write.mode(mode).saveAsTable(table_name)
+        logger.info(f"Data written to {table_name} successfully.")
+    except Exception as e:
+        message = f"Error writing data to {table_name}: {e}"
+        logger.error(message)
+        raise WriteToTableException(message) from e
+
+
+def generate_final_reconcile_output(
+    recon_id: str,
+    spark: SparkSession,
+    metadata_config: ReconcileMetadataConfig = ReconcileMetadataConfig(),
+    local_test_run: bool = False,
+) -> ReconcileOutput:
+    _db_prefix = "default" if local_test_run else f"{metadata_config.catalog}.{metadata_config.schema}"
+    recon_df = spark.sql(
+        f"""
+        SELECT
+        CASE
+            WHEN COALESCE(MAIN.SOURCE_TABLE.CATALOG, '') <> '' THEN CONCAT(MAIN.SOURCE_TABLE.CATALOG, '.', MAIN.SOURCE_TABLE.SCHEMA, '.', MAIN.SOURCE_TABLE.TABLE_NAME)
+            ELSE CONCAT(MAIN.SOURCE_TABLE.SCHEMA, '.', MAIN.SOURCE_TABLE.TABLE_NAME)
+        END AS SOURCE_TABLE,
+        CONCAT(MAIN.TARGET_TABLE.CATALOG, '.', MAIN.TARGET_TABLE.SCHEMA, '.', MAIN.TARGET_TABLE.TABLE_NAME) AS TARGET_TABLE,
+        CASE WHEN lower(MAIN.report_type) in ('all', 'row', 'data') THEN
+        CASE
+            WHEN METRICS.recon_metrics.row_comparison.missing_in_source = 0 AND METRICS.recon_metrics.row_comparison.missing_in_target = 0 THEN TRUE
+            ELSE FALSE
+        END
+        ELSE NULL END AS ROW,
+        CASE WHEN lower(MAIN.report_type) in ('all', 'data') THEN
+        CASE
+            WHEN (METRICS.run_metrics.status = true) or
+            (METRICS.recon_metrics.column_comparison.absolute_mismatch = 0 AND METRICS.recon_metrics.column_comparison.threshold_mismatch = 0 AND METRICS.recon_metrics.column_comparison.mismatch_columns = '') THEN TRUE
+            ELSE FALSE
+        END
+        ELSE NULL END AS COLUMN,
+        CASE WHEN lower(MAIN.report_type) in ('all', 'schema') THEN
+        CASE
+            WHEN METRICS.recon_metrics.schema_comparison = true THEN TRUE
+            ELSE FALSE
+        END
+        ELSE NULL END AS SCHEMA,
+        METRICS.run_metrics.exception_message AS EXCEPTION_MESSAGE
+        FROM
+            {_db_prefix}.{_RECON_TABLE_NAME} MAIN
+        INNER JOIN
+            {_db_prefix}.{_RECON_METRICS_TABLE_NAME} METRICS
+        ON
+            (MAIN.recon_table_id = METRICS.recon_table_id)
+        WHERE
+            MAIN.recon_id = '{recon_id}'
+        """
+    )
+    table_output = []
+    for row in recon_df.collect():
+        if row.EXCEPTION_MESSAGE is not None and row.EXCEPTION_MESSAGE != "":
+            table_output.append(
+                ReconcileTableOutput(
+                    target_table_name=row.TARGET_TABLE,
+                    source_table_name=row.SOURCE_TABLE,
+                    status=StatusOutput(),
+                    exception_message=row.EXCEPTION_MESSAGE,
+                )
+            )
+        else:
+            table_output.append(
+                ReconcileTableOutput(
+                    target_table_name=row.TARGET_TABLE,
+                    source_table_name=row.SOURCE_TABLE,
+                    status=StatusOutput(row=row.ROW, column=row.COLUMN, schema=row.SCHEMA),
+                    exception_message=row.EXCEPTION_MESSAGE,
+                )
+            )
+    final_reconcile_output = ReconcileOutput(recon_id=recon_id, results=table_output)
+    logger.info(f"Final reconcile output: {final_reconcile_output}")
+    return final_reconcile_output
+
+
+def generate_final_reconcile_aggregate_output(
+    recon_id: str,
+    spark: SparkSession,
+    metadata_config: ReconcileMetadataConfig = ReconcileMetadataConfig(),
+    local_test_run: bool = False,
+) -> ReconcileOutput:
+    _db_prefix = "default" if local_test_run else f"{metadata_config.catalog}.{metadata_config.schema}"
+    recon_df = spark.sql(
+        f"""
+        SELECT source_table,
+        target_table,
+        EVERY(status) AS status,
+        ARRAY_JOIN(COLLECT_SET(exception_message), '\n') AS exception_message
+        FROM
+        (SELECT
+            IF(ISNULL(main.source_table.catalog)
+            , CONCAT_WS('.', main.source_table.schema, main.source_table.table_name)
+            , CONCAT_WS('.', main.source_table.catalog, main.source_table.schema, main.source_table.table_name)) AS source_table,
+            CONCAT_WS('.', main.target_table.catalog, main.target_table.schema, main.target_table.table_name) AS target_table,
+            IF(metrics.run_metrics.status='true', TRUE , FALSE) AS status,
+            metrics.run_metrics.exception_message AS exception_message
+        FROM
+            {_db_prefix}.{_RECON_TABLE_NAME} main
+        INNER JOIN
+            {_db_prefix}.{_RECON_AGGREGATE_METRICS_TABLE_NAME} metrics
+        ON
+            (MAIN.recon_table_id = METRICS.recon_table_id
+            AND MAIN.operation_name = 'aggregates-reconcile')
+        WHERE
+            MAIN.recon_id = '{recon_id}'
+        )
+        GROUP BY source_table, target_table;
+        """
+    )
+    table_output = []
+    for row in recon_df.collect():
+        if row.exception_message is not None and row.exception_message != "":
+            table_output.append(
+                ReconcileTableOutput(
+                    target_table_name=row.target_table,
+                    source_table_name=row.source_table,
+                    status=StatusOutput(),
+                    exception_message=row.exception_message,
+                )
+            )
+        else:
+            table_output.append(
+                ReconcileTableOutput(
+                    target_table_name=row.target_table,
+                    source_table_name=row.source_table,
+                    status=StatusOutput(aggregate=row.status),
+                    exception_message=row.exception_message,
+                )
+            )
+    final_reconcile_output = ReconcileOutput(recon_id=recon_id, results=table_output)
+    logger.info(f"Final reconcile output: {final_reconcile_output}")
+    return final_reconcile_output
+
+
+class ReconCapture:
+
+    def __init__(
+        self,
+        database_config: DatabaseConfig,
+        recon_id: str,
+        report_type: str,
+        source_dialect: Dialect,
+        ws: WorkspaceClient,
+        spark: SparkSession,
+        metadata_config: ReconcileMetadataConfig = ReconcileMetadataConfig(),
+        local_test_run: bool = False,
+    ):
+        self.database_config = database_config
+        self.recon_id = recon_id
+        self.report_type = report_type
+        self.source_dialect = source_dialect
+        self.ws = ws
+        self.spark = spark
+        self._db_prefix = "default" if local_test_run else f"{metadata_config.catalog}.{metadata_config.schema}"
+
+    def _generate_recon_main_id(
+        self,
+        table_conf: Table,
+    ) -> int:
+        full_source_table = (
+            f"{self.database_config.source_schema}.{table_conf.source_name}"
+            if self.database_config.source_catalog is None
+            else f"{self.database_config.source_catalog}.{self.database_config.source_schema}.{table_conf.source_name}"
+        )
+        full_target_table = (
+            f"{self.database_config.target_catalog}.{self.database_config.target_schema}.{table_conf.target_name}"
+        )
+        return hash(f"{self.recon_id}{full_source_table}{full_target_table}")
+
+    def _insert_into_main_table(
+        self,
+        recon_table_id: int,
+        table_conf: Table,
+        recon_process_duration: ReconcileProcessDuration,
+        operation_name: str = "reconcile",
+    ) -> None:
+        source_dialect_key = get_key_from_dialect(self.source_dialect)
+        df = self.spark.sql(
+            f"""
+            select {recon_table_id} as recon_table_id,
+            '{self.recon_id}' as recon_id,
+            case
+                when '{source_dialect_key}' = 'databricks' then 'Databricks'
+                when '{source_dialect_key}' = 'snowflake' then 'Snowflake'
+                when '{source_dialect_key}' = 'oracle' then 'Oracle'
+                else '{source_dialect_key}'
+            end as source_type,
+            named_struct(
+                'catalog', case when '{self.database_config.source_catalog}' = 'None' then null else '{self.database_config.source_catalog}' end,
+                'schema', '{self.database_config.source_schema}',
+                'table_name', '{table_conf.source_name}'
+            ) as source_table,
+            named_struct(
+                'catalog', '{self.database_config.target_catalog}',
+                'schema', '{self.database_config.target_schema}',
+                'table_name', '{table_conf.target_name}'
+            ) as target_table,
+            '{self.report_type}' as report_type,
+            '{operation_name}' as operation_name,
+            cast('{recon_process_duration.start_ts}' as timestamp) as start_ts,
+            cast('{recon_process_duration.end_ts}' as timestamp) as end_ts
+            """
+        )
+        _write_df_to_delta(df, f"{self._db_prefix}.{_RECON_TABLE_NAME}")
+
+    @classmethod
+    def _is_mismatch_within_threshold_limits(
+        cls, data_reconcile_output: DataReconcileOutput, table_conf: Table, record_count: ReconcileRecordCount
+    ):
+        total_mismatch_count = (
+            data_reconcile_output.mismatch_count + data_reconcile_output.threshold_output.threshold_mismatch_count
+        )
+        logger.info(f"total_mismatch_count : {total_mismatch_count}")
+        logger.warning(f"reconciled_record_count : {record_count}")
+        # if the mismatch count is 0 then no need of checking bounds.
+        if total_mismatch_count == 0:
+            return True
+        # pull out table thresholds
+        thresholds: list[TableThresholds] = (
+            [threshold for threshold in table_conf.table_thresholds if threshold.model == "mismatch"]
+            if table_conf.table_thresholds
+            else []
+        )
+        # if not table thresholds are provided return false
+        if not thresholds:
+            return False
+
+        res = None
+        for threshold in thresholds:
+            mode = threshold.get_mode()
+            lower_bound = int(threshold.lower_bound.replace("%", ""))
+            upper_bound = int(threshold.upper_bound.replace("%", ""))
+            if mode == "absolute":
+                res = lower_bound <= total_mismatch_count <= upper_bound
+            if mode == "percentage":
+                lower_bound = int(round((lower_bound / 100) * record_count.source))
+                upper_bound = int(round((upper_bound / 100) * record_count.source))
+                res = lower_bound <= total_mismatch_count <= upper_bound
+
+        return res
+
+    def _insert_into_metrics_table(
+        self,
+        recon_table_id: int,
+        data_reconcile_output: DataReconcileOutput,
+        schema_reconcile_output: SchemaReconcileOutput,
+        table_conf: Table,
+        record_count: ReconcileRecordCount,
+    ) -> None:
+        status = False
+        if data_reconcile_output.exception in {None, ''} and schema_reconcile_output.exception in {None, ''}:
+            status = (
+                # validate for both exact mismatch and threshold mismatch
+                self._is_mismatch_within_threshold_limits(
+                    data_reconcile_output=data_reconcile_output, table_conf=table_conf, record_count=record_count
+                )
+                and data_reconcile_output.missing_in_src_count == 0
+                and data_reconcile_output.missing_in_tgt_count == 0
+                and schema_reconcile_output.is_valid
+            )
+
+        exception_msg = ""
+        if schema_reconcile_output.exception is not None:
+            exception_msg = schema_reconcile_output.exception.replace("'", '').replace('"', '')
+        if data_reconcile_output.exception is not None:
+            exception_msg = data_reconcile_output.exception.replace("'", '').replace('"', '')
+
+        insertion_time = str(datetime.now())
+        mismatch_columns = []
+        if data_reconcile_output.mismatch and data_reconcile_output.mismatch.mismatch_columns:
+            mismatch_columns = data_reconcile_output.mismatch.mismatch_columns
+
+        df = self.spark.sql(
+            f"""
+            select {recon_table_id} as recon_table_id,
+            named_struct(
+                'row_comparison', case when '{self.report_type.lower()}' in ('all', 'row', 'data')
+                and '{exception_msg}' = '' then
+                named_struct(
+                    'missing_in_source', cast({data_reconcile_output.missing_in_src_count} as bigint),
+                    'missing_in_target', cast({data_reconcile_output.missing_in_tgt_count} as bigint)
+                ) else null end,
+                'column_comparison', case when '{self.report_type.lower()}' in ('all', 'data')
+                and '{exception_msg}' = '' then
+                named_struct(
+                    'absolute_mismatch', cast({data_reconcile_output.mismatch_count} as bigint),
+                    'threshold_mismatch', cast({data_reconcile_output.threshold_output.threshold_mismatch_count} as bigint),
+                    'mismatch_columns', '{",".join(mismatch_columns)}'
+                ) else null end,
+                'schema_comparison', case when '{self.report_type.lower()}' in ('all', 'schema')
+                and '{exception_msg}' = '' then
+                {schema_reconcile_output.is_valid} else null end
+            ) as recon_metrics,
+            named_struct(
+                'status', {status},
+                'run_by_user', '{self.ws.current_user.me().user_name}',
+                'exception_message', "{exception_msg}"
+            ) as run_metrics,
+            cast('{insertion_time}' as timestamp) as inserted_ts
+            """
+        )
+        _write_df_to_delta(df, f"{self._db_prefix}.{_RECON_METRICS_TABLE_NAME}")
+
+    @classmethod
+    def _create_map_column(
+        cls,
+        recon_table_id: int,
+        df: DataFrame,
+        recon_type: str,
+        status: bool,
+    ) -> DataFrame:
+        columns = df.columns
+        # Create a list of column names and their corresponding column values
+        map_args = []
+        for column in columns:
+            map_args.extend([lit(column).alias(column + "_key"), col(column).cast("string").alias(column + "_value")])
+        # Create a new DataFrame with a map column
+        df = df.select(create_map(*map_args).alias("data"))
+        df = (
+            df.withColumn("recon_table_id", lit(recon_table_id))
+            .withColumn("recon_type", lit(recon_type))
+            .withColumn("status", lit(status))
+            .withColumn("inserted_ts", lit(datetime.now()))
+        )
+        return (
+            df.groupBy("recon_table_id", "recon_type", "status", "inserted_ts")
+            .agg(collect_list("data").alias("data"))
+            .selectExpr("recon_table_id", "recon_type", "status", "data", "inserted_ts")
+        )
+
+    def _create_map_column_and_insert(
+        self,
+        recon_table_id: int,
+        df: DataFrame,
+        recon_type: str,
+        status: bool,
+    ) -> None:
+        df = self._create_map_column(recon_table_id, df, recon_type, status)
+        _write_df_to_delta(df, f"{self._db_prefix}.{_RECON_DETAILS_TABLE_NAME}")
+
+    def _insert_into_details_table(
+        self,
+        recon_table_id: int,
+        reconcile_output: DataReconcileOutput,
+        schema_output: SchemaReconcileOutput,
+    ):
+        if reconcile_output.mismatch_count > 0 and reconcile_output.mismatch.mismatch_df:
+            self._create_map_column_and_insert(
+                recon_table_id,
+                reconcile_output.mismatch.mismatch_df,
+                "mismatch",
+                False,
+            )
+
+        if reconcile_output.missing_in_src_count > 0 and reconcile_output.missing_in_src:
+            self._create_map_column_and_insert(
+                recon_table_id,
+                reconcile_output.missing_in_src,
+                "missing_in_source",
+                False,
+            )
+
+        if reconcile_output.missing_in_tgt_count > 0 and reconcile_output.missing_in_tgt:
+            self._create_map_column_and_insert(
+                recon_table_id,
+                reconcile_output.missing_in_tgt,
+                "missing_in_target",
+                False,
+            )
+
+        if (
+            reconcile_output.threshold_output.threshold_mismatch_count > 0
+            and reconcile_output.threshold_output.threshold_df
+        ):
+            self._create_map_column_and_insert(
+                recon_table_id,
+                reconcile_output.threshold_output.threshold_df,
+                "threshold_mismatch",
+                False,
+            )
+
+        if schema_output.compare_df is not None:
+            self._create_map_column_and_insert(
+                recon_table_id, schema_output.compare_df, "schema", schema_output.is_valid
+            )
+
+    def _get_df(
+        self,
+        recon_table_id: int,
+        agg_data: DataReconcileOutput,
+        recon_type: str,
+    ):
+
+        column_count = agg_data.mismatch_count
+        agg_df = agg_data.mismatch.mismatch_df
+        match recon_type:
+            case "missing_in_source":
+                column_count = agg_data.missing_in_src_count
+                agg_df = agg_data.missing_in_src
+            case "missing_in_target":
+                column_count = agg_data.missing_in_tgt_count
+                agg_df = agg_data.missing_in_tgt
+
+        if column_count > 0 and agg_df:
+            return self._create_map_column(
+                recon_table_id,
+                agg_df,
+                recon_type,
+                False,
+            )
+        return None
+
+    @classmethod
+    def _union_dataframes(cls, df_list: list[DataFrame]) -> DataFrame:
+        return reduce(lambda agg_df, df: agg_df.unionByName(df), df_list)
+
+    def _insert_aggregates_into_metrics_table(
+        self,
+        recon_table_id: int,
+        reconcile_agg_output_list: list[AggregateQueryOutput],
+    ) -> None:
+
+        agg_metrics_df_list = []
+        for agg_output in reconcile_agg_output_list:
+            agg_data = agg_output.reconcile_output
+
+            status = False
+            if agg_data.exception in {None, ''}:
+                status = not (
+                    agg_data.mismatch_count > 0
+                    or agg_data.missing_in_src_count > 0
+                    or agg_data.missing_in_tgt_count > 0
+                )
+
+            exception_msg = ""
+            if agg_data.exception is not None:
+                exception_msg = agg_data.exception.replace("'", '').replace('"', '')
+
+            insertion_time = str(datetime.now())
+
+            # If there is any exception while running the Query,
+            # each rule is stored, with the Exception message in the metrics table
+            assert agg_output.rule, "Aggregate Rule must be present for storing the metrics"
+            rule_id = hash(f"{recon_table_id}_{agg_output.rule.column_from_rule}")
+
+            agg_metrics_df = self.spark.sql(
+                f"""
+                select {recon_table_id} as recon_table_id,
+                {rule_id} as rule_id,
+                if('{exception_msg}' = '', named_struct(
+                    'missing_in_source', {agg_data.missing_in_src_count},
+                    'missing_in_target', {agg_data.missing_in_tgt_count},
+                    'mismatch', {agg_data.mismatch_count}
+                ), null) as recon_metrics,
+                named_struct(
+                    'status', {status},
+                    'run_by_user', '{self.ws.current_user.me().user_name}',
+                    'exception_message', "{exception_msg}"
+                ) as run_metrics,
+                cast('{insertion_time}' as timestamp) as inserted_ts
+                """
+            )
+            agg_metrics_df_list.append(agg_metrics_df)
+
+        agg_metrics_table_df = self._union_dataframes(agg_metrics_df_list)
+        _write_df_to_delta(agg_metrics_table_df, f"{self._db_prefix}.{_RECON_AGGREGATE_METRICS_TABLE_NAME}")
+
+    def _insert_aggregates_into_details_table(
+        self, recon_table_id: int, reconcile_agg_output_list: list[AggregateQueryOutput]
+    ):
+        agg_details_df_list = []
+        for agg_output in reconcile_agg_output_list:
+            agg_details_rule_df_list = []
+
+            mismatch_df = self._get_df(recon_table_id, agg_output.reconcile_output, "mismatch")
+            if mismatch_df and not mismatch_df.isEmpty():
+                agg_details_rule_df_list.append(mismatch_df)
+
+            missing_src_df = self._get_df(recon_table_id, agg_output.reconcile_output, "missing_in_source")
+            if missing_src_df and not missing_src_df.isEmpty():
+                agg_details_rule_df_list.append(missing_src_df)
+
+            missing_tgt_df = self._get_df(recon_table_id, agg_output.reconcile_output, "missing_in_target")
+            if missing_tgt_df and not missing_tgt_df.isEmpty():
+                agg_details_rule_df_list.append(missing_tgt_df)
+
+            if agg_details_rule_df_list:
+                agg_details_rule_df = self._union_dataframes(agg_details_rule_df_list)
+                if agg_output.rule:
+                    rule_id = hash(f"{recon_table_id}_{agg_output.rule.column_from_rule}")
+                    agg_details_rule_df = agg_details_rule_df.withColumn("rule_id", lit(rule_id)).select(
+                        "recon_table_id", "rule_id", "recon_type", "data", "inserted_ts"
+                    )
+                agg_details_df_list.append(agg_details_rule_df)
+            else:
+                logger.warning("Aggregate Details Rules are empty")
+
+        if agg_details_df_list:
+            agg_details_table_df = self._union_dataframes(agg_details_df_list)
+            _write_df_to_delta(agg_details_table_df, f"{self._db_prefix}.{_RECON_AGGREGATE_DETAILS_TABLE_NAME}")
+
+    def start(
+        self,
+        data_reconcile_output: DataReconcileOutput,
+        schema_reconcile_output: SchemaReconcileOutput,
+        table_conf: Table,
+        recon_process_duration: ReconcileProcessDuration,
+        record_count: ReconcileRecordCount,
+    ) -> None:
+        recon_table_id = self._generate_recon_main_id(table_conf)
+        self._insert_into_main_table(recon_table_id, table_conf, recon_process_duration)
+        self._insert_into_metrics_table(
+            recon_table_id, data_reconcile_output, schema_reconcile_output, table_conf, record_count
+        )
+        self._insert_into_details_table(recon_table_id, data_reconcile_output, schema_reconcile_output)
+
+    def store_aggregates_metrics(
+        self,
+        table_conf: Table,
+        recon_process_duration: ReconcileProcessDuration,
+        reconcile_agg_output_list: list[AggregateQueryOutput],
+    ) -> None:
+        recon_table_id = self._generate_recon_main_id(table_conf)
+        self._insert_into_main_table(recon_table_id, table_conf, recon_process_duration, 'aggregates-reconcile')
+        self._insert_into_rules_table(recon_table_id, reconcile_agg_output_list)
+        self._insert_aggregates_into_metrics_table(recon_table_id, reconcile_agg_output_list)
+        self._insert_aggregates_into_details_table(
+            recon_table_id,
+            reconcile_agg_output_list,
+        )

+    def _insert_into_rules_table(self, recon_table_id: int, reconcile_agg_output_list: list[AggregateQueryOutput]):
+
+        rule_df_list = []
+        for agg_output in reconcile_agg_output_list:
+            if not agg_output.rule:
+                logger.error("Aggregate Rule must be present for storing the rules")
+                continue
+            rule_id = hash(f"{recon_table_id}_{agg_output.rule.column_from_rule}")
+            rule_query = agg_output.rule.get_rule_query(rule_id)
+            rule_df_list.append(
+                self.spark.sql(rule_query)
+                .withColumn("inserted_ts", lit(datetime.now()))
+                .select("rule_id", "rule_type", "rule_info", "inserted_ts")
+            )
+
+        if rule_df_list:
+            rules_table_df = self._union_dataframes(rule_df_list)
+            _write_df_to_delta(rules_table_df, f"{self._db_prefix}.{_RECON_AGGREGATE_RULES_TABLE_NAME}")
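The 635 added lines above match the entry for databricks/labs/lakebridge/reconcile/recon_capture.py in the listing at the top of this diff. The sketch below is not taken from the package: it is a minimal usage example that assumes the dataclass constructors in config.py and recon_output_config.py accept the field names that recon_capture.py reads (source_catalog, source_schema, source_name, start_ts, and so on), that sqlglot's Dialect.get_or_raise can resolve the source dialect, and that the *Output objects would normally be produced by the reconcile pipeline rather than constructed by hand.

# Hypothetical usage sketch -- constructor signatures are assumptions inferred
# from the attribute access in recon_capture.py, not verified package APIs.
from pyspark.sql import SparkSession
from sqlglot import Dialect

from databricks.sdk import WorkspaceClient
from databricks.labs.lakebridge.config import DatabaseConfig, ReconcileMetadataConfig, Table
from databricks.labs.lakebridge.reconcile.recon_capture import ReconCapture, generate_final_reconcile_output
from databricks.labs.lakebridge.reconcile.recon_output_config import (
    DataReconcileOutput,
    ReconcileProcessDuration,
    ReconcileRecordCount,
    SchemaReconcileOutput,
)

spark = SparkSession.builder.getOrCreate()

capture = ReconCapture(
    database_config=DatabaseConfig(  # field names taken from the reads in _generate_recon_main_id
        source_catalog="sf_db",
        source_schema="sales",
        target_catalog="main",
        target_schema="sales",
    ),
    recon_id="00000000-0000-0000-0000-000000000000",  # placeholder run id
    report_type="all",
    source_dialect=Dialect.get_or_raise("snowflake"),  # assumed dialect lookup
    ws=WorkspaceClient(),
    spark=spark,
    metadata_config=ReconcileMetadataConfig(),
)

# The output objects are normally produced by reconcile/execute.py; stubs here.
capture.start(
    data_reconcile_output=DataReconcileOutput(),
    schema_reconcile_output=SchemaReconcileOutput(is_valid=True),
    table_conf=Table(source_name="orders", target_name="orders"),
    recon_process_duration=ReconcileProcessDuration(
        start_ts="2025-01-01 00:00:00", end_ts="2025-01-01 00:05:00"
    ),
    record_count=ReconcileRecordCount(source=1_000, target=1_000),
)

# Summarise the run from the "main" and "metrics" tables written by start().
print(generate_final_reconcile_output(recon_id=capture.recon_id, spark=spark))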