databricks_labs_lakebridge-0.10.0-py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (171)
  1. databricks/__init__.py +3 -0
  2. databricks/labs/__init__.py +3 -0
  3. databricks/labs/lakebridge/__about__.py +2 -0
  4. databricks/labs/lakebridge/__init__.py +11 -0
  5. databricks/labs/lakebridge/assessments/configure_assessment.py +194 -0
  6. databricks/labs/lakebridge/assessments/pipeline.py +188 -0
  7. databricks/labs/lakebridge/assessments/profiler_config.py +30 -0
  8. databricks/labs/lakebridge/base_install.py +12 -0
  9. databricks/labs/lakebridge/cli.py +449 -0
  10. databricks/labs/lakebridge/config.py +192 -0
  11. databricks/labs/lakebridge/connections/__init__.py +0 -0
  12. databricks/labs/lakebridge/connections/credential_manager.py +89 -0
  13. databricks/labs/lakebridge/connections/database_manager.py +98 -0
  14. databricks/labs/lakebridge/connections/env_getter.py +13 -0
  15. databricks/labs/lakebridge/contexts/__init__.py +0 -0
  16. databricks/labs/lakebridge/contexts/application.py +133 -0
  17. databricks/labs/lakebridge/coverage/__init__.py +0 -0
  18. databricks/labs/lakebridge/coverage/commons.py +223 -0
  19. databricks/labs/lakebridge/coverage/lakebridge_snow_transpilation_coverage.py +29 -0
  20. databricks/labs/lakebridge/coverage/local_report.py +9 -0
  21. databricks/labs/lakebridge/coverage/sqlglot_snow_transpilation_coverage.py +5 -0
  22. databricks/labs/lakebridge/coverage/sqlglot_tsql_transpilation_coverage.py +5 -0
  23. databricks/labs/lakebridge/deployment/__init__.py +0 -0
  24. databricks/labs/lakebridge/deployment/configurator.py +199 -0
  25. databricks/labs/lakebridge/deployment/dashboard.py +140 -0
  26. databricks/labs/lakebridge/deployment/installation.py +125 -0
  27. databricks/labs/lakebridge/deployment/job.py +147 -0
  28. databricks/labs/lakebridge/deployment/recon.py +145 -0
  29. databricks/labs/lakebridge/deployment/table.py +30 -0
  30. databricks/labs/lakebridge/deployment/upgrade_common.py +124 -0
  31. databricks/labs/lakebridge/discovery/table.py +36 -0
  32. databricks/labs/lakebridge/discovery/table_definition.py +23 -0
  33. databricks/labs/lakebridge/discovery/tsql_table_definition.py +185 -0
  34. databricks/labs/lakebridge/errors/exceptions.py +1 -0
  35. databricks/labs/lakebridge/helpers/__init__.py +0 -0
  36. databricks/labs/lakebridge/helpers/db_sql.py +24 -0
  37. databricks/labs/lakebridge/helpers/execution_time.py +20 -0
  38. databricks/labs/lakebridge/helpers/file_utils.py +64 -0
  39. databricks/labs/lakebridge/helpers/metastore.py +164 -0
  40. databricks/labs/lakebridge/helpers/recon_config_utils.py +176 -0
  41. databricks/labs/lakebridge/helpers/string_utils.py +62 -0
  42. databricks/labs/lakebridge/helpers/telemetry_utils.py +13 -0
  43. databricks/labs/lakebridge/helpers/validation.py +101 -0
  44. databricks/labs/lakebridge/install.py +849 -0
  45. databricks/labs/lakebridge/intermediate/__init__.py +0 -0
  46. databricks/labs/lakebridge/intermediate/dag.py +88 -0
  47. databricks/labs/lakebridge/intermediate/engine_adapter.py +0 -0
  48. databricks/labs/lakebridge/intermediate/root_tables.py +44 -0
  49. databricks/labs/lakebridge/jvmproxy.py +56 -0
  50. databricks/labs/lakebridge/lineage.py +42 -0
  51. databricks/labs/lakebridge/reconcile/__init__.py +0 -0
  52. databricks/labs/lakebridge/reconcile/compare.py +414 -0
  53. databricks/labs/lakebridge/reconcile/connectors/__init__.py +0 -0
  54. databricks/labs/lakebridge/reconcile/connectors/data_source.py +72 -0
  55. databricks/labs/lakebridge/reconcile/connectors/databricks.py +87 -0
  56. databricks/labs/lakebridge/reconcile/connectors/jdbc_reader.py +41 -0
  57. databricks/labs/lakebridge/reconcile/connectors/oracle.py +108 -0
  58. databricks/labs/lakebridge/reconcile/connectors/secrets.py +30 -0
  59. databricks/labs/lakebridge/reconcile/connectors/snowflake.py +173 -0
  60. databricks/labs/lakebridge/reconcile/connectors/source_adapter.py +30 -0
  61. databricks/labs/lakebridge/reconcile/connectors/sql_server.py +132 -0
  62. databricks/labs/lakebridge/reconcile/constants.py +37 -0
  63. databricks/labs/lakebridge/reconcile/exception.py +42 -0
  64. databricks/labs/lakebridge/reconcile/execute.py +920 -0
  65. databricks/labs/lakebridge/reconcile/query_builder/__init__.py +0 -0
  66. databricks/labs/lakebridge/reconcile/query_builder/aggregate_query.py +293 -0
  67. databricks/labs/lakebridge/reconcile/query_builder/base.py +138 -0
  68. databricks/labs/lakebridge/reconcile/query_builder/count_query.py +33 -0
  69. databricks/labs/lakebridge/reconcile/query_builder/expression_generator.py +292 -0
  70. databricks/labs/lakebridge/reconcile/query_builder/hash_query.py +91 -0
  71. databricks/labs/lakebridge/reconcile/query_builder/sampling_query.py +123 -0
  72. databricks/labs/lakebridge/reconcile/query_builder/threshold_query.py +231 -0
  73. databricks/labs/lakebridge/reconcile/recon_capture.py +635 -0
  74. databricks/labs/lakebridge/reconcile/recon_config.py +363 -0
  75. databricks/labs/lakebridge/reconcile/recon_output_config.py +85 -0
  76. databricks/labs/lakebridge/reconcile/runner.py +97 -0
  77. databricks/labs/lakebridge/reconcile/sampler.py +239 -0
  78. databricks/labs/lakebridge/reconcile/schema_compare.py +126 -0
  79. databricks/labs/lakebridge/resources/__init__.py +0 -0
  80. databricks/labs/lakebridge/resources/config/credentials.yml +33 -0
  81. databricks/labs/lakebridge/resources/reconcile/__init__.py +0 -0
  82. databricks/labs/lakebridge/resources/reconcile/dashboards/__init__.py +0 -0
  83. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/00_0_aggregate_recon_header.md +6 -0
  84. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_0_recon_id.filter.yml +6 -0
  85. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_1_executed_by.filter.yml +5 -0
  86. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_2_started_at.filter.yml +5 -0
  87. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_0_source_type.filter.yml +5 -0
  88. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_1_source_table.filter.yml +5 -0
  89. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_2_target_table.filter.yml +5 -0
  90. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/04_0_aggregate_summary_table.sql +46 -0
  91. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/05_0_aggregate_recon_drilldown_header.md +2 -0
  92. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_0_recon_id.filter.yml +5 -0
  93. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_1_category.filter.yml +5 -0
  94. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_2_aggregate_type.filter.yml +5 -0
  95. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/07_0_target_table.filter.yml +4 -0
  96. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/07_1_source_table.filter.yml +4 -0
  97. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/08_0_aggregate_details_table.sql +92 -0
  98. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/09_0_aggregate_missing_mismatch_header.md +1 -0
  99. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/10_0_aggr_mismatched_records.sql +19 -0
  100. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/11_0_aggr_missing_in_databricks.sql +19 -0
  101. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/11_1_aggr_missing_in_source.sql +19 -0
  102. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/dashboard.yml +365 -0
  103. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/00_0_recon_main.md +3 -0
  104. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/01_0_recon_id.filter.yml +6 -0
  105. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/01_1_report_type.filter.yml +5 -0
  106. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/01_2_executed_by.filter.yml +5 -0
  107. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/02_0_source_type.filter.yml +5 -0
  108. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/02_1_source_table.filter.yml +6 -0
  109. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/02_2_target_table.filter.yml +6 -0
  110. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/03_0_started_at.filter.yml +5 -0
  111. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/05_0_summary_table.sql +38 -0
  112. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/06_0_schema_comparison_header.md +3 -0
  113. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/07_0_schema_details_table.sql +42 -0
  114. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/08_0_drill_down_header.md +3 -0
  115. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/09_0_recon_id.filter.yml +4 -0
  116. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/09_1_category.filter.yml +4 -0
  117. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/10_0_target_table.filter.yml +4 -0
  118. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/10_1_source_table.filter.yml +4 -0
  119. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/11_0_recon_details_pivot.sql +40 -0
  120. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/12_0_daily_data_validation_issue_header.md +3 -0
  121. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/13_0_success_fail_.filter.yml +4 -0
  122. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/14_0_failed_recon_ids.sql +15 -0
  123. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/15_0_total_failed_runs.sql +10 -0
  124. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/15_1_failed_targets.sql +10 -0
  125. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/15_2_successful_targets.sql +10 -0
  126. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/16_0_missing_mismatch_header.md +1 -0
  127. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/17_0_mismatched_records.sql +14 -0
  128. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/17_1_threshold_mismatches.sql +14 -0
  129. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/18_0_missing_in_databricks.sql +14 -0
  130. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/18_1_missing_in_source.sql +14 -0
  131. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/dashboard.yml +545 -0
  132. databricks/labs/lakebridge/resources/reconcile/queries/__init__.py +0 -0
  133. databricks/labs/lakebridge/resources/reconcile/queries/installation/__init__.py +0 -0
  134. databricks/labs/lakebridge/resources/reconcile/queries/installation/aggregate_details.sql +7 -0
  135. databricks/labs/lakebridge/resources/reconcile/queries/installation/aggregate_metrics.sql +15 -0
  136. databricks/labs/lakebridge/resources/reconcile/queries/installation/aggregate_rules.sql +6 -0
  137. databricks/labs/lakebridge/resources/reconcile/queries/installation/details.sql +7 -0
  138. databricks/labs/lakebridge/resources/reconcile/queries/installation/main.sql +24 -0
  139. databricks/labs/lakebridge/resources/reconcile/queries/installation/metrics.sql +21 -0
  140. databricks/labs/lakebridge/transpiler/__init__.py +0 -0
  141. databricks/labs/lakebridge/transpiler/execute.py +423 -0
  142. databricks/labs/lakebridge/transpiler/lsp/__init__.py +0 -0
  143. databricks/labs/lakebridge/transpiler/lsp/lsp_engine.py +564 -0
  144. databricks/labs/lakebridge/transpiler/sqlglot/__init__.py +0 -0
  145. databricks/labs/lakebridge/transpiler/sqlglot/dialect_utils.py +30 -0
  146. databricks/labs/lakebridge/transpiler/sqlglot/generator/__init__.py +0 -0
  147. databricks/labs/lakebridge/transpiler/sqlglot/generator/databricks.py +771 -0
  148. databricks/labs/lakebridge/transpiler/sqlglot/lca_utils.py +138 -0
  149. databricks/labs/lakebridge/transpiler/sqlglot/local_expression.py +197 -0
  150. databricks/labs/lakebridge/transpiler/sqlglot/parsers/__init__.py +0 -0
  151. databricks/labs/lakebridge/transpiler/sqlglot/parsers/oracle.py +23 -0
  152. databricks/labs/lakebridge/transpiler/sqlglot/parsers/presto.py +202 -0
  153. databricks/labs/lakebridge/transpiler/sqlglot/parsers/snowflake.py +535 -0
  154. databricks/labs/lakebridge/transpiler/sqlglot/sqlglot_engine.py +203 -0
  155. databricks/labs/lakebridge/transpiler/transpile_engine.py +49 -0
  156. databricks/labs/lakebridge/transpiler/transpile_status.py +68 -0
  157. databricks/labs/lakebridge/uninstall.py +28 -0
  158. databricks/labs/lakebridge/upgrades/v0.4.0_add_main_table_operation_name_column.py +80 -0
  159. databricks/labs/lakebridge/upgrades/v0.6.0_alter_metrics_datatype.py +51 -0
  160. databricks_labs_lakebridge-0.10.0.dist-info/METADATA +58 -0
  161. databricks_labs_lakebridge-0.10.0.dist-info/RECORD +171 -0
  162. databricks_labs_lakebridge-0.10.0.dist-info/WHEEL +4 -0
  163. databricks_labs_lakebridge-0.10.0.dist-info/entry_points.txt +2 -0
  164. databricks_labs_lakebridge-0.10.0.dist-info/licenses/LICENSE +69 -0
  165. databricks_labs_lakebridge-0.10.0.dist-info/licenses/NOTICE +42 -0
  166. docs/lakebridge/src/components/Button.tsx +81 -0
  167. docs/lakebridge/src/css/custom.css +167 -0
  168. docs/lakebridge/src/css/table.css +20 -0
  169. docs/lakebridge/src/pages/index.tsx +57 -0
  170. docs/lakebridge/src/theme/Footer/index.tsx +24 -0
  171. docs/lakebridge/src/theme/Layout/index.tsx +18 -0
databricks/labs/lakebridge/reconcile/recon_capture.py
@@ -0,0 +1,635 @@
+import logging
+from datetime import datetime
+from functools import reduce
+
+from pyspark.sql import DataFrame, SparkSession
+from pyspark.sql.functions import col, collect_list, create_map, lit
+from pyspark.sql.types import StringType, StructField, StructType
+from pyspark.errors import PySparkException
+from sqlglot import Dialect
+
+from databricks.labs.lakebridge.config import DatabaseConfig, Table, ReconcileMetadataConfig
+from databricks.labs.lakebridge.reconcile.recon_config import TableThresholds
+from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import get_key_from_dialect
+from databricks.labs.lakebridge.reconcile.exception import (
+    WriteToTableException,
+    ReadAndWriteWithVolumeException,
+    CleanFromVolumeException,
+)
+from databricks.labs.lakebridge.reconcile.recon_output_config import (
+    DataReconcileOutput,
+    ReconcileOutput,
+    ReconcileProcessDuration,
+    ReconcileTableOutput,
+    SchemaReconcileOutput,
+    StatusOutput,
+    ReconcileRecordCount,
+    AggregateQueryOutput,
+)
+from databricks.sdk import WorkspaceClient
+
+logger = logging.getLogger(__name__)
+
+_RECON_TABLE_NAME = "main"
+_RECON_METRICS_TABLE_NAME = "metrics"
+_RECON_DETAILS_TABLE_NAME = "details"
+_RECON_AGGREGATE_RULES_TABLE_NAME = "aggregate_rules"
+_RECON_AGGREGATE_METRICS_TABLE_NAME = "aggregate_metrics"
+_RECON_AGGREGATE_DETAILS_TABLE_NAME = "aggregate_details"
+
+
+class ReconIntermediatePersist:
+
+    def __init__(self, spark: SparkSession, path: str):
+        self.spark = spark
+        self.path = path
+
+    def _write_unmatched_df_to_volumes(
+        self,
+        unmatched_df: DataFrame,
+    ) -> None:
+        unmatched_df.write.format("parquet").mode("overwrite").save(self.path)
+
+    def _read_unmatched_df_from_volumes(self) -> DataFrame:
+        return self.spark.read.format("parquet").load(self.path)
+
+    def clean_unmatched_df_from_volume(self):
+        try:
+            # TODO: for now we are overwriting the intermediate cache path. We should delete the volume in future
+            # workspace_client.dbfs.get_status(path)
+            # workspace_client.dbfs.delete(path, recursive=True)
+            empty_df = self.spark.createDataFrame([], schema=StructType([StructField("empty", StringType(), True)]))
+            empty_df.write.format("parquet").mode("overwrite").save(self.path)
+            logger.warning(f"Unmatched DF cleaned up from {self.path} successfully.")
+        except PySparkException as e:
+            message = f"Error cleaning up unmatched DF from {self.path} volumes --> {e}"
+            logger.error(message)
+            raise CleanFromVolumeException(message) from e
+
+    def write_and_read_unmatched_df_with_volumes(
+        self,
+        unmatched_df: DataFrame,
+    ) -> DataFrame:
+        try:
+            self._write_unmatched_df_to_volumes(unmatched_df)
+            return self._read_unmatched_df_from_volumes()
+        except PySparkException as e:
+            message = f"Exception in reading or writing unmatched DF with volumes {self.path} --> {e}"
+            logger.error(message)
+            raise ReadAndWriteWithVolumeException(message) from e
+
+
+def _write_df_to_delta(df: DataFrame, table_name: str, mode="append"):
+    try:
+        df.write.mode(mode).saveAsTable(table_name)
+        logger.info(f"Data written to {table_name} successfully.")
+    except Exception as e:
+        message = f"Error writing data to {table_name}: {e}"
+        logger.error(message)
+        raise WriteToTableException(message) from e
+
+
+def generate_final_reconcile_output(
+    recon_id: str,
+    spark: SparkSession,
+    metadata_config: ReconcileMetadataConfig = ReconcileMetadataConfig(),
+    local_test_run: bool = False,
+) -> ReconcileOutput:
+    _db_prefix = "default" if local_test_run else f"{metadata_config.catalog}.{metadata_config.schema}"
+    recon_df = spark.sql(
+        f"""
+        SELECT
+        CASE
+            WHEN COALESCE(MAIN.SOURCE_TABLE.CATALOG, '') <> '' THEN CONCAT(MAIN.SOURCE_TABLE.CATALOG, '.', MAIN.SOURCE_TABLE.SCHEMA, '.', MAIN.SOURCE_TABLE.TABLE_NAME)
+            ELSE CONCAT(MAIN.SOURCE_TABLE.SCHEMA, '.', MAIN.SOURCE_TABLE.TABLE_NAME)
+        END AS SOURCE_TABLE,
+        CONCAT(MAIN.TARGET_TABLE.CATALOG, '.', MAIN.TARGET_TABLE.SCHEMA, '.', MAIN.TARGET_TABLE.TABLE_NAME) AS TARGET_TABLE,
+        CASE WHEN lower(MAIN.report_type) in ('all', 'row', 'data') THEN
+            CASE
+                WHEN METRICS.recon_metrics.row_comparison.missing_in_source = 0 AND METRICS.recon_metrics.row_comparison.missing_in_target = 0 THEN TRUE
+                ELSE FALSE
+            END
+        ELSE NULL END AS ROW,
+        CASE WHEN lower(MAIN.report_type) in ('all', 'data') THEN
+            CASE
+                WHEN (METRICS.run_metrics.status = true) or
+                (METRICS.recon_metrics.column_comparison.absolute_mismatch = 0 AND METRICS.recon_metrics.column_comparison.threshold_mismatch = 0 AND METRICS.recon_metrics.column_comparison.mismatch_columns = '') THEN TRUE
+                ELSE FALSE
+            END
+        ELSE NULL END AS COLUMN,
+        CASE WHEN lower(MAIN.report_type) in ('all', 'schema') THEN
+            CASE
+                WHEN METRICS.recon_metrics.schema_comparison = true THEN TRUE
+                ELSE FALSE
+            END
+        ELSE NULL END AS SCHEMA,
+        METRICS.run_metrics.exception_message AS EXCEPTION_MESSAGE
+        FROM
+            {_db_prefix}.{_RECON_TABLE_NAME} MAIN
+        INNER JOIN
+            {_db_prefix}.{_RECON_METRICS_TABLE_NAME} METRICS
+        ON
+            (MAIN.recon_table_id = METRICS.recon_table_id)
+        WHERE
+            MAIN.recon_id = '{recon_id}'
+        """
+    )
+    table_output = []
+    for row in recon_df.collect():
+        if row.EXCEPTION_MESSAGE is not None and row.EXCEPTION_MESSAGE != "":
+            table_output.append(
+                ReconcileTableOutput(
+                    target_table_name=row.TARGET_TABLE,
+                    source_table_name=row.SOURCE_TABLE,
+                    status=StatusOutput(),
+                    exception_message=row.EXCEPTION_MESSAGE,
+                )
+            )
+        else:
+            table_output.append(
+                ReconcileTableOutput(
+                    target_table_name=row.TARGET_TABLE,
+                    source_table_name=row.SOURCE_TABLE,
+                    status=StatusOutput(row=row.ROW, column=row.COLUMN, schema=row.SCHEMA),
+                    exception_message=row.EXCEPTION_MESSAGE,
+                )
+            )
+    final_reconcile_output = ReconcileOutput(recon_id=recon_id, results=table_output)
+    logger.info(f"Final reconcile output: {final_reconcile_output}")
+    return final_reconcile_output
+
+
+def generate_final_reconcile_aggregate_output(
+    recon_id: str,
+    spark: SparkSession,
+    metadata_config: ReconcileMetadataConfig = ReconcileMetadataConfig(),
+    local_test_run: bool = False,
+) -> ReconcileOutput:
+    _db_prefix = "default" if local_test_run else f"{metadata_config.catalog}.{metadata_config.schema}"
+    recon_df = spark.sql(
+        f"""
+        SELECT source_table,
+            target_table,
+            EVERY(status) AS status,
+            ARRAY_JOIN(COLLECT_SET(exception_message), '\n') AS exception_message
+        FROM
+            (SELECT
+                IF(ISNULL(main.source_table.catalog)
+                    , CONCAT_WS('.', main.source_table.schema, main.source_table.table_name)
+                    , CONCAT_WS('.', main.source_table.catalog, main.source_table.schema, main.source_table.table_name)) AS source_table,
+                CONCAT_WS('.', main.target_table.catalog, main.target_table.schema, main.target_table.table_name) AS target_table,
+                IF(metrics.run_metrics.status='true', TRUE , FALSE) AS status,
+                metrics.run_metrics.exception_message AS exception_message
+            FROM
+                {_db_prefix}.{_RECON_TABLE_NAME} main
+            INNER JOIN
+                {_db_prefix}.{_RECON_AGGREGATE_METRICS_TABLE_NAME} metrics
+            ON
+                (MAIN.recon_table_id = METRICS.recon_table_id
+                AND MAIN.operation_name = 'aggregates-reconcile')
+            WHERE
+                MAIN.recon_id = '{recon_id}'
+            )
+        GROUP BY source_table, target_table;
+        """
+    )
+    table_output = []
+    for row in recon_df.collect():
+        if row.exception_message is not None and row.exception_message != "":
+            table_output.append(
+                ReconcileTableOutput(
+                    target_table_name=row.target_table,
+                    source_table_name=row.source_table,
+                    status=StatusOutput(),
+                    exception_message=row.exception_message,
+                )
+            )
+        else:
+            table_output.append(
+                ReconcileTableOutput(
+                    target_table_name=row.target_table,
+                    source_table_name=row.source_table,
+                    status=StatusOutput(aggregate=row.status),
+                    exception_message=row.exception_message,
+                )
+            )
+    final_reconcile_output = ReconcileOutput(recon_id=recon_id, results=table_output)
+    logger.info(f"Final reconcile output: {final_reconcile_output}")
+    return final_reconcile_output
+
+
+class ReconCapture:
+
+    def __init__(
+        self,
+        database_config: DatabaseConfig,
+        recon_id: str,
+        report_type: str,
+        source_dialect: Dialect,
+        ws: WorkspaceClient,
+        spark: SparkSession,
+        metadata_config: ReconcileMetadataConfig = ReconcileMetadataConfig(),
+        local_test_run: bool = False,
+    ):
+        self.database_config = database_config
+        self.recon_id = recon_id
+        self.report_type = report_type
+        self.source_dialect = source_dialect
+        self.ws = ws
+        self.spark = spark
+        self._db_prefix = "default" if local_test_run else f"{metadata_config.catalog}.{metadata_config.schema}"
+
+    def _generate_recon_main_id(
+        self,
+        table_conf: Table,
+    ) -> int:
+        full_source_table = (
+            f"{self.database_config.source_schema}.{table_conf.source_name}"
+            if self.database_config.source_catalog is None
+            else f"{self.database_config.source_catalog}.{self.database_config.source_schema}.{table_conf.source_name}"
+        )
+        full_target_table = (
+            f"{self.database_config.target_catalog}.{self.database_config.target_schema}.{table_conf.target_name}"
+        )
+        return hash(f"{self.recon_id}{full_source_table}{full_target_table}")
+
+    def _insert_into_main_table(
+        self,
+        recon_table_id: int,
+        table_conf: Table,
+        recon_process_duration: ReconcileProcessDuration,
+        operation_name: str = "reconcile",
+    ) -> None:
+        source_dialect_key = get_key_from_dialect(self.source_dialect)
+        df = self.spark.sql(
+            f"""
+            select {recon_table_id} as recon_table_id,
+                '{self.recon_id}' as recon_id,
+                case
+                    when '{source_dialect_key}' = 'databricks' then 'Databricks'
+                    when '{source_dialect_key}' = 'snowflake' then 'Snowflake'
+                    when '{source_dialect_key}' = 'oracle' then 'Oracle'
+                    else '{source_dialect_key}'
+                end as source_type,
+                named_struct(
+                    'catalog', case when '{self.database_config.source_catalog}' = 'None' then null else '{self.database_config.source_catalog}' end,
+                    'schema', '{self.database_config.source_schema}',
+                    'table_name', '{table_conf.source_name}'
+                ) as source_table,
+                named_struct(
+                    'catalog', '{self.database_config.target_catalog}',
+                    'schema', '{self.database_config.target_schema}',
+                    'table_name', '{table_conf.target_name}'
+                ) as target_table,
+                '{self.report_type}' as report_type,
+                '{operation_name}' as operation_name,
+                cast('{recon_process_duration.start_ts}' as timestamp) as start_ts,
+                cast('{recon_process_duration.end_ts}' as timestamp) as end_ts
+            """
+        )
+        _write_df_to_delta(df, f"{self._db_prefix}.{_RECON_TABLE_NAME}")
+
+    @classmethod
+    def _is_mismatch_within_threshold_limits(
+        cls, data_reconcile_output: DataReconcileOutput, table_conf: Table, record_count: ReconcileRecordCount
+    ):
+        total_mismatch_count = (
+            data_reconcile_output.mismatch_count + data_reconcile_output.threshold_output.threshold_mismatch_count
+        )
+        logger.info(f"total_mismatch_count : {total_mismatch_count}")
+        logger.warning(f"reconciled_record_count : {record_count}")
+        # if the mismatch count is 0 then no need of checking bounds.
+        if total_mismatch_count == 0:
+            return True
+        # pull out table thresholds
+        thresholds: list[TableThresholds] = (
+            [threshold for threshold in table_conf.table_thresholds if threshold.model == "mismatch"]
+            if table_conf.table_thresholds
+            else []
+        )
+        # if not table thresholds are provided return false
+        if not thresholds:
+            return False
+
+        res = None
+        for threshold in thresholds:
+            mode = threshold.get_mode()
+            lower_bound = int(threshold.lower_bound.replace("%", ""))
+            upper_bound = int(threshold.upper_bound.replace("%", ""))
+            if mode == "absolute":
+                res = lower_bound <= total_mismatch_count <= upper_bound
+            if mode == "percentage":
+                lower_bound = int(round((lower_bound / 100) * record_count.source))
+                upper_bound = int(round((upper_bound / 100) * record_count.source))
+                res = lower_bound <= total_mismatch_count <= upper_bound
+
+        return res
+
+    def _insert_into_metrics_table(
+        self,
+        recon_table_id: int,
+        data_reconcile_output: DataReconcileOutput,
+        schema_reconcile_output: SchemaReconcileOutput,
+        table_conf: Table,
+        record_count: ReconcileRecordCount,
+    ) -> None:
+        status = False
+        if data_reconcile_output.exception in {None, ''} and schema_reconcile_output.exception in {None, ''}:
+            status = (
+                # validate for both exact mismatch and threshold mismatch
+                self._is_mismatch_within_threshold_limits(
+                    data_reconcile_output=data_reconcile_output, table_conf=table_conf, record_count=record_count
+                )
+                and data_reconcile_output.missing_in_src_count == 0
+                and data_reconcile_output.missing_in_tgt_count == 0
+                and schema_reconcile_output.is_valid
+            )
+
+        exception_msg = ""
+        if schema_reconcile_output.exception is not None:
+            exception_msg = schema_reconcile_output.exception.replace("'", '').replace('"', '')
+        if data_reconcile_output.exception is not None:
+            exception_msg = data_reconcile_output.exception.replace("'", '').replace('"', '')
+
+        insertion_time = str(datetime.now())
+        mismatch_columns = []
+        if data_reconcile_output.mismatch and data_reconcile_output.mismatch.mismatch_columns:
+            mismatch_columns = data_reconcile_output.mismatch.mismatch_columns
+
+        df = self.spark.sql(
+            f"""
+            select {recon_table_id} as recon_table_id,
+                named_struct(
+                    'row_comparison', case when '{self.report_type.lower()}' in ('all', 'row', 'data')
+                        and '{exception_msg}' = '' then
+                        named_struct(
+                            'missing_in_source', cast({data_reconcile_output.missing_in_src_count} as bigint),
+                            'missing_in_target', cast({data_reconcile_output.missing_in_tgt_count} as bigint)
+                        ) else null end,
+                    'column_comparison', case when '{self.report_type.lower()}' in ('all', 'data')
+                        and '{exception_msg}' = '' then
+                        named_struct(
+                            'absolute_mismatch', cast({data_reconcile_output.mismatch_count} as bigint),
+                            'threshold_mismatch', cast({data_reconcile_output.threshold_output.threshold_mismatch_count} as bigint),
+                            'mismatch_columns', '{",".join(mismatch_columns)}'
+                        ) else null end,
+                    'schema_comparison', case when '{self.report_type.lower()}' in ('all', 'schema')
+                        and '{exception_msg}' = '' then
+                        {schema_reconcile_output.is_valid} else null end
+                ) as recon_metrics,
+                named_struct(
+                    'status', {status},
+                    'run_by_user', '{self.ws.current_user.me().user_name}',
+                    'exception_message', "{exception_msg}"
+                ) as run_metrics,
+                cast('{insertion_time}' as timestamp) as inserted_ts
+            """
+        )
+        _write_df_to_delta(df, f"{self._db_prefix}.{_RECON_METRICS_TABLE_NAME}")
+
+    @classmethod
+    def _create_map_column(
+        cls,
+        recon_table_id: int,
+        df: DataFrame,
+        recon_type: str,
+        status: bool,
+    ) -> DataFrame:
+        columns = df.columns
+        # Create a list of column names and their corresponding column values
+        map_args = []
+        for column in columns:
+            map_args.extend([lit(column).alias(column + "_key"), col(column).cast("string").alias(column + "_value")])
+        # Create a new DataFrame with a map column
+        df = df.select(create_map(*map_args).alias("data"))
+        df = (
+            df.withColumn("recon_table_id", lit(recon_table_id))
+            .withColumn("recon_type", lit(recon_type))
+            .withColumn("status", lit(status))
+            .withColumn("inserted_ts", lit(datetime.now()))
+        )
+        return (
+            df.groupBy("recon_table_id", "recon_type", "status", "inserted_ts")
+            .agg(collect_list("data").alias("data"))
+            .selectExpr("recon_table_id", "recon_type", "status", "data", "inserted_ts")
+        )
+
+    def _create_map_column_and_insert(
+        self,
+        recon_table_id: int,
+        df: DataFrame,
+        recon_type: str,
+        status: bool,
+    ) -> None:
+        df = self._create_map_column(recon_table_id, df, recon_type, status)
+        _write_df_to_delta(df, f"{self._db_prefix}.{_RECON_DETAILS_TABLE_NAME}")
+
+    def _insert_into_details_table(
+        self,
+        recon_table_id: int,
+        reconcile_output: DataReconcileOutput,
+        schema_output: SchemaReconcileOutput,
+    ):
+        if reconcile_output.mismatch_count > 0 and reconcile_output.mismatch.mismatch_df:
+            self._create_map_column_and_insert(
+                recon_table_id,
+                reconcile_output.mismatch.mismatch_df,
+                "mismatch",
+                False,
+            )
+
+        if reconcile_output.missing_in_src_count > 0 and reconcile_output.missing_in_src:
+            self._create_map_column_and_insert(
+                recon_table_id,
+                reconcile_output.missing_in_src,
+                "missing_in_source",
+                False,
+            )
+
+        if reconcile_output.missing_in_tgt_count > 0 and reconcile_output.missing_in_tgt:
+            self._create_map_column_and_insert(
+                recon_table_id,
+                reconcile_output.missing_in_tgt,
+                "missing_in_target",
+                False,
+            )
+
+        if (
+            reconcile_output.threshold_output.threshold_mismatch_count > 0
+            and reconcile_output.threshold_output.threshold_df
+        ):
+            self._create_map_column_and_insert(
+                recon_table_id,
+                reconcile_output.threshold_output.threshold_df,
+                "threshold_mismatch",
+                False,
+            )
+
+        if schema_output.compare_df is not None:
+            self._create_map_column_and_insert(
+                recon_table_id, schema_output.compare_df, "schema", schema_output.is_valid
+            )
+
+    def _get_df(
+        self,
+        recon_table_id: int,
+        agg_data: DataReconcileOutput,
+        recon_type: str,
+    ):
+
+        column_count = agg_data.mismatch_count
+        agg_df = agg_data.mismatch.mismatch_df
+        match recon_type:
+            case "missing_in_source":
+                column_count = agg_data.missing_in_src_count
+                agg_df = agg_data.missing_in_src
+            case "missing_in_target":
+                column_count = agg_data.missing_in_tgt_count
+                agg_df = agg_data.missing_in_tgt
+
+        if column_count > 0 and agg_df:
+            return self._create_map_column(
+                recon_table_id,
+                agg_df,
+                recon_type,
+                False,
+            )
+        return None
+
+    @classmethod
+    def _union_dataframes(cls, df_list: list[DataFrame]) -> DataFrame:
+        return reduce(lambda agg_df, df: agg_df.unionByName(df), df_list)
+
+    def _insert_aggregates_into_metrics_table(
+        self,
+        recon_table_id: int,
+        reconcile_agg_output_list: list[AggregateQueryOutput],
+    ) -> None:
+
+        agg_metrics_df_list = []
+        for agg_output in reconcile_agg_output_list:
+            agg_data = agg_output.reconcile_output
+
+            status = False
+            if agg_data.exception in {None, ''}:
+                status = not (
+                    agg_data.mismatch_count > 0
+                    or agg_data.missing_in_src_count > 0
+                    or agg_data.missing_in_tgt_count > 0
+                )
+
+            exception_msg = ""
+            if agg_data.exception is not None:
+                exception_msg = agg_data.exception.replace("'", '').replace('"', '')
+
+            insertion_time = str(datetime.now())
+
+            # If there is any exception while running the Query,
+            # each rule is stored, with the Exception message in the metrics table
+            assert agg_output.rule, "Aggregate Rule must be present for storing the metrics"
+            rule_id = hash(f"{recon_table_id}_{agg_output.rule.column_from_rule}")
+
+            agg_metrics_df = self.spark.sql(
+                f"""
+                select {recon_table_id} as recon_table_id,
+                    {rule_id} as rule_id,
+                    if('{exception_msg}' = '', named_struct(
+                        'missing_in_source', {agg_data.missing_in_src_count},
+                        'missing_in_target', {agg_data.missing_in_tgt_count},
+                        'mismatch', {agg_data.mismatch_count}
+                    ), null) as recon_metrics,
+                    named_struct(
+                        'status', {status},
+                        'run_by_user', '{self.ws.current_user.me().user_name}',
+                        'exception_message', "{exception_msg}"
+                    ) as run_metrics,
+                    cast('{insertion_time}' as timestamp) as inserted_ts
+                """
+            )
+            agg_metrics_df_list.append(agg_metrics_df)
+
+        agg_metrics_table_df = self._union_dataframes(agg_metrics_df_list)
+        _write_df_to_delta(agg_metrics_table_df, f"{self._db_prefix}.{_RECON_AGGREGATE_METRICS_TABLE_NAME}")
+
+    def _insert_aggregates_into_details_table(
+        self, recon_table_id: int, reconcile_agg_output_list: list[AggregateQueryOutput]
+    ):
+        agg_details_df_list = []
+        for agg_output in reconcile_agg_output_list:
+            agg_details_rule_df_list = []
+
+            mismatch_df = self._get_df(recon_table_id, agg_output.reconcile_output, "mismatch")
+            if mismatch_df and not mismatch_df.isEmpty():
+                agg_details_rule_df_list.append(mismatch_df)
+
+            missing_src_df = self._get_df(recon_table_id, agg_output.reconcile_output, "missing_in_source")
+            if missing_src_df and not missing_src_df.isEmpty():
+                agg_details_rule_df_list.append(missing_src_df)
+
+            missing_tgt_df = self._get_df(recon_table_id, agg_output.reconcile_output, "missing_in_target")
+            if missing_tgt_df and not missing_tgt_df.isEmpty():
+                agg_details_rule_df_list.append(missing_tgt_df)
+
+            if agg_details_rule_df_list:
+                agg_details_rule_df = self._union_dataframes(agg_details_rule_df_list)
+                if agg_output.rule:
+                    rule_id = hash(f"{recon_table_id}_{agg_output.rule.column_from_rule}")
+                    agg_details_rule_df = agg_details_rule_df.withColumn("rule_id", lit(rule_id)).select(
+                        "recon_table_id", "rule_id", "recon_type", "data", "inserted_ts"
+                    )
+                    agg_details_df_list.append(agg_details_rule_df)
+            else:
+                logger.warning("Aggregate Details Rules are empty")
+
+        if agg_details_df_list:
+            agg_details_table_df = self._union_dataframes(agg_details_df_list)
+            _write_df_to_delta(agg_details_table_df, f"{self._db_prefix}.{_RECON_AGGREGATE_DETAILS_TABLE_NAME}")
+
+    def start(
+        self,
+        data_reconcile_output: DataReconcileOutput,
+        schema_reconcile_output: SchemaReconcileOutput,
+        table_conf: Table,
+        recon_process_duration: ReconcileProcessDuration,
+        record_count: ReconcileRecordCount,
+    ) -> None:
+        recon_table_id = self._generate_recon_main_id(table_conf)
+        self._insert_into_main_table(recon_table_id, table_conf, recon_process_duration)
+        self._insert_into_metrics_table(
+            recon_table_id, data_reconcile_output, schema_reconcile_output, table_conf, record_count
+        )
+        self._insert_into_details_table(recon_table_id, data_reconcile_output, schema_reconcile_output)
+
+    def store_aggregates_metrics(
+        self,
+        table_conf: Table,
+        recon_process_duration: ReconcileProcessDuration,
+        reconcile_agg_output_list: list[AggregateQueryOutput],
+    ) -> None:
+        recon_table_id = self._generate_recon_main_id(table_conf)
+        self._insert_into_main_table(recon_table_id, table_conf, recon_process_duration, 'aggregates-reconcile')
+        self._insert_into_rules_table(recon_table_id, reconcile_agg_output_list)
+        self._insert_aggregates_into_metrics_table(recon_table_id, reconcile_agg_output_list)
+        self._insert_aggregates_into_details_table(
+            recon_table_id,
+            reconcile_agg_output_list,
+        )
+
+    def _insert_into_rules_table(self, recon_table_id: int, reconcile_agg_output_list: list[AggregateQueryOutput]):
+
+        rule_df_list = []
+        for agg_output in reconcile_agg_output_list:
+            if not agg_output.rule:
+                logger.error("Aggregate Rule must be present for storing the rules")
+                continue
+            rule_id = hash(f"{recon_table_id}_{agg_output.rule.column_from_rule}")
+            rule_query = agg_output.rule.get_rule_query(rule_id)
+            rule_df_list.append(
+                self.spark.sql(rule_query)
+                .withColumn("inserted_ts", lit(datetime.now()))
+                .select("rule_id", "rule_type", "rule_info", "inserted_ts")
+            )
+
+        if rule_df_list:
+            rules_table_df = self._union_dataframes(rule_df_list)
+            _write_df_to_delta(rules_table_df, f"{self._db_prefix}.{_RECON_AGGREGATE_RULES_TABLE_NAME}")
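
For orientation, a minimal sketch of how the reporting helper defined in this module might be called once a reconcile run has populated the metadata tables. The active SparkSession (spark) and the recon_id value are illustrative assumptions, and metadata_config is left at its ReconcileMetadataConfig() default, as in the function signature above.

from databricks.labs.lakebridge.reconcile.recon_capture import generate_final_reconcile_output

# Assumption: "spark" is an existing SparkSession; the recon_id below is a placeholder for a real run id.
final_output = generate_final_reconcile_output(
    recon_id="<your-recon-run-id>",
    spark=spark,
)
for result in final_output.results:
    # Each ReconcileTableOutput carries the table pair, a StatusOutput, and any exception message.
    print(result.target_table_name, result.status, result.exception_message)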