databricks-labs-lakebridge 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (171)
  1. databricks/__init__.py +3 -0
  2. databricks/labs/__init__.py +3 -0
  3. databricks/labs/lakebridge/__about__.py +2 -0
  4. databricks/labs/lakebridge/__init__.py +11 -0
  5. databricks/labs/lakebridge/assessments/configure_assessment.py +194 -0
  6. databricks/labs/lakebridge/assessments/pipeline.py +188 -0
  7. databricks/labs/lakebridge/assessments/profiler_config.py +30 -0
  8. databricks/labs/lakebridge/base_install.py +12 -0
  9. databricks/labs/lakebridge/cli.py +449 -0
  10. databricks/labs/lakebridge/config.py +192 -0
  11. databricks/labs/lakebridge/connections/__init__.py +0 -0
  12. databricks/labs/lakebridge/connections/credential_manager.py +89 -0
  13. databricks/labs/lakebridge/connections/database_manager.py +98 -0
  14. databricks/labs/lakebridge/connections/env_getter.py +13 -0
  15. databricks/labs/lakebridge/contexts/__init__.py +0 -0
  16. databricks/labs/lakebridge/contexts/application.py +133 -0
  17. databricks/labs/lakebridge/coverage/__init__.py +0 -0
  18. databricks/labs/lakebridge/coverage/commons.py +223 -0
  19. databricks/labs/lakebridge/coverage/lakebridge_snow_transpilation_coverage.py +29 -0
  20. databricks/labs/lakebridge/coverage/local_report.py +9 -0
  21. databricks/labs/lakebridge/coverage/sqlglot_snow_transpilation_coverage.py +5 -0
  22. databricks/labs/lakebridge/coverage/sqlglot_tsql_transpilation_coverage.py +5 -0
  23. databricks/labs/lakebridge/deployment/__init__.py +0 -0
  24. databricks/labs/lakebridge/deployment/configurator.py +199 -0
  25. databricks/labs/lakebridge/deployment/dashboard.py +140 -0
  26. databricks/labs/lakebridge/deployment/installation.py +125 -0
  27. databricks/labs/lakebridge/deployment/job.py +147 -0
  28. databricks/labs/lakebridge/deployment/recon.py +145 -0
  29. databricks/labs/lakebridge/deployment/table.py +30 -0
  30. databricks/labs/lakebridge/deployment/upgrade_common.py +124 -0
  31. databricks/labs/lakebridge/discovery/table.py +36 -0
  32. databricks/labs/lakebridge/discovery/table_definition.py +23 -0
  33. databricks/labs/lakebridge/discovery/tsql_table_definition.py +185 -0
  34. databricks/labs/lakebridge/errors/exceptions.py +1 -0
  35. databricks/labs/lakebridge/helpers/__init__.py +0 -0
  36. databricks/labs/lakebridge/helpers/db_sql.py +24 -0
  37. databricks/labs/lakebridge/helpers/execution_time.py +20 -0
  38. databricks/labs/lakebridge/helpers/file_utils.py +64 -0
  39. databricks/labs/lakebridge/helpers/metastore.py +164 -0
  40. databricks/labs/lakebridge/helpers/recon_config_utils.py +176 -0
  41. databricks/labs/lakebridge/helpers/string_utils.py +62 -0
  42. databricks/labs/lakebridge/helpers/telemetry_utils.py +13 -0
  43. databricks/labs/lakebridge/helpers/validation.py +101 -0
  44. databricks/labs/lakebridge/install.py +849 -0
  45. databricks/labs/lakebridge/intermediate/__init__.py +0 -0
  46. databricks/labs/lakebridge/intermediate/dag.py +88 -0
  47. databricks/labs/lakebridge/intermediate/engine_adapter.py +0 -0
  48. databricks/labs/lakebridge/intermediate/root_tables.py +44 -0
  49. databricks/labs/lakebridge/jvmproxy.py +56 -0
  50. databricks/labs/lakebridge/lineage.py +42 -0
  51. databricks/labs/lakebridge/reconcile/__init__.py +0 -0
  52. databricks/labs/lakebridge/reconcile/compare.py +414 -0
  53. databricks/labs/lakebridge/reconcile/connectors/__init__.py +0 -0
  54. databricks/labs/lakebridge/reconcile/connectors/data_source.py +72 -0
  55. databricks/labs/lakebridge/reconcile/connectors/databricks.py +87 -0
  56. databricks/labs/lakebridge/reconcile/connectors/jdbc_reader.py +41 -0
  57. databricks/labs/lakebridge/reconcile/connectors/oracle.py +108 -0
  58. databricks/labs/lakebridge/reconcile/connectors/secrets.py +30 -0
  59. databricks/labs/lakebridge/reconcile/connectors/snowflake.py +173 -0
  60. databricks/labs/lakebridge/reconcile/connectors/source_adapter.py +30 -0
  61. databricks/labs/lakebridge/reconcile/connectors/sql_server.py +132 -0
  62. databricks/labs/lakebridge/reconcile/constants.py +37 -0
  63. databricks/labs/lakebridge/reconcile/exception.py +42 -0
  64. databricks/labs/lakebridge/reconcile/execute.py +920 -0
  65. databricks/labs/lakebridge/reconcile/query_builder/__init__.py +0 -0
  66. databricks/labs/lakebridge/reconcile/query_builder/aggregate_query.py +293 -0
  67. databricks/labs/lakebridge/reconcile/query_builder/base.py +138 -0
  68. databricks/labs/lakebridge/reconcile/query_builder/count_query.py +33 -0
  69. databricks/labs/lakebridge/reconcile/query_builder/expression_generator.py +292 -0
  70. databricks/labs/lakebridge/reconcile/query_builder/hash_query.py +91 -0
  71. databricks/labs/lakebridge/reconcile/query_builder/sampling_query.py +123 -0
  72. databricks/labs/lakebridge/reconcile/query_builder/threshold_query.py +231 -0
  73. databricks/labs/lakebridge/reconcile/recon_capture.py +635 -0
  74. databricks/labs/lakebridge/reconcile/recon_config.py +363 -0
  75. databricks/labs/lakebridge/reconcile/recon_output_config.py +85 -0
  76. databricks/labs/lakebridge/reconcile/runner.py +97 -0
  77. databricks/labs/lakebridge/reconcile/sampler.py +239 -0
  78. databricks/labs/lakebridge/reconcile/schema_compare.py +126 -0
  79. databricks/labs/lakebridge/resources/__init__.py +0 -0
  80. databricks/labs/lakebridge/resources/config/credentials.yml +33 -0
  81. databricks/labs/lakebridge/resources/reconcile/__init__.py +0 -0
  82. databricks/labs/lakebridge/resources/reconcile/dashboards/__init__.py +0 -0
  83. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/00_0_aggregate_recon_header.md +6 -0
  84. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_0_recon_id.filter.yml +6 -0
  85. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_1_executed_by.filter.yml +5 -0
  86. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_2_started_at.filter.yml +5 -0
  87. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_0_source_type.filter.yml +5 -0
  88. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_1_source_table.filter.yml +5 -0
  89. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_2_target_table.filter.yml +5 -0
  90. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/04_0_aggregate_summary_table.sql +46 -0
  91. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/05_0_aggregate_recon_drilldown_header.md +2 -0
  92. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_0_recon_id.filter.yml +5 -0
  93. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_1_category.filter.yml +5 -0
  94. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_2_aggregate_type.filter.yml +5 -0
  95. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/07_0_target_table.filter.yml +4 -0
  96. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/07_1_source_table.filter.yml +4 -0
  97. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/08_0_aggregate_details_table.sql +92 -0
  98. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/09_0_aggregate_missing_mismatch_header.md +1 -0
  99. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/10_0_aggr_mismatched_records.sql +19 -0
  100. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/11_0_aggr_missing_in_databricks.sql +19 -0
  101. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/11_1_aggr_missing_in_source.sql +19 -0
  102. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/dashboard.yml +365 -0
  103. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/00_0_recon_main.md +3 -0
  104. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/01_0_recon_id.filter.yml +6 -0
  105. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/01_1_report_type.filter.yml +5 -0
  106. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/01_2_executed_by.filter.yml +5 -0
  107. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/02_0_source_type.filter.yml +5 -0
  108. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/02_1_source_table.filter.yml +6 -0
  109. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/02_2_target_table.filter.yml +6 -0
  110. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/03_0_started_at.filter.yml +5 -0
  111. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/05_0_summary_table.sql +38 -0
  112. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/06_0_schema_comparison_header.md +3 -0
  113. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/07_0_schema_details_table.sql +42 -0
  114. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/08_0_drill_down_header.md +3 -0
  115. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/09_0_recon_id.filter.yml +4 -0
  116. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/09_1_category.filter.yml +4 -0
  117. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/10_0_target_table.filter.yml +4 -0
  118. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/10_1_source_table.filter.yml +4 -0
  119. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/11_0_recon_details_pivot.sql +40 -0
  120. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/12_0_daily_data_validation_issue_header.md +3 -0
  121. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/13_0_success_fail_.filter.yml +4 -0
  122. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/14_0_failed_recon_ids.sql +15 -0
  123. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/15_0_total_failed_runs.sql +10 -0
  124. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/15_1_failed_targets.sql +10 -0
  125. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/15_2_successful_targets.sql +10 -0
  126. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/16_0_missing_mismatch_header.md +1 -0
  127. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/17_0_mismatched_records.sql +14 -0
  128. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/17_1_threshold_mismatches.sql +14 -0
  129. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/18_0_missing_in_databricks.sql +14 -0
  130. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/18_1_missing_in_source.sql +14 -0
  131. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/dashboard.yml +545 -0
  132. databricks/labs/lakebridge/resources/reconcile/queries/__init__.py +0 -0
  133. databricks/labs/lakebridge/resources/reconcile/queries/installation/__init__.py +0 -0
  134. databricks/labs/lakebridge/resources/reconcile/queries/installation/aggregate_details.sql +7 -0
  135. databricks/labs/lakebridge/resources/reconcile/queries/installation/aggregate_metrics.sql +15 -0
  136. databricks/labs/lakebridge/resources/reconcile/queries/installation/aggregate_rules.sql +6 -0
  137. databricks/labs/lakebridge/resources/reconcile/queries/installation/details.sql +7 -0
  138. databricks/labs/lakebridge/resources/reconcile/queries/installation/main.sql +24 -0
  139. databricks/labs/lakebridge/resources/reconcile/queries/installation/metrics.sql +21 -0
  140. databricks/labs/lakebridge/transpiler/__init__.py +0 -0
  141. databricks/labs/lakebridge/transpiler/execute.py +423 -0
  142. databricks/labs/lakebridge/transpiler/lsp/__init__.py +0 -0
  143. databricks/labs/lakebridge/transpiler/lsp/lsp_engine.py +564 -0
  144. databricks/labs/lakebridge/transpiler/sqlglot/__init__.py +0 -0
  145. databricks/labs/lakebridge/transpiler/sqlglot/dialect_utils.py +30 -0
  146. databricks/labs/lakebridge/transpiler/sqlglot/generator/__init__.py +0 -0
  147. databricks/labs/lakebridge/transpiler/sqlglot/generator/databricks.py +771 -0
  148. databricks/labs/lakebridge/transpiler/sqlglot/lca_utils.py +138 -0
  149. databricks/labs/lakebridge/transpiler/sqlglot/local_expression.py +197 -0
  150. databricks/labs/lakebridge/transpiler/sqlglot/parsers/__init__.py +0 -0
  151. databricks/labs/lakebridge/transpiler/sqlglot/parsers/oracle.py +23 -0
  152. databricks/labs/lakebridge/transpiler/sqlglot/parsers/presto.py +202 -0
  153. databricks/labs/lakebridge/transpiler/sqlglot/parsers/snowflake.py +535 -0
  154. databricks/labs/lakebridge/transpiler/sqlglot/sqlglot_engine.py +203 -0
  155. databricks/labs/lakebridge/transpiler/transpile_engine.py +49 -0
  156. databricks/labs/lakebridge/transpiler/transpile_status.py +68 -0
  157. databricks/labs/lakebridge/uninstall.py +28 -0
  158. databricks/labs/lakebridge/upgrades/v0.4.0_add_main_table_operation_name_column.py +80 -0
  159. databricks/labs/lakebridge/upgrades/v0.6.0_alter_metrics_datatype.py +51 -0
  160. databricks_labs_lakebridge-0.10.0.dist-info/METADATA +58 -0
  161. databricks_labs_lakebridge-0.10.0.dist-info/RECORD +171 -0
  162. databricks_labs_lakebridge-0.10.0.dist-info/WHEEL +4 -0
  163. databricks_labs_lakebridge-0.10.0.dist-info/entry_points.txt +2 -0
  164. databricks_labs_lakebridge-0.10.0.dist-info/licenses/LICENSE +69 -0
  165. databricks_labs_lakebridge-0.10.0.dist-info/licenses/NOTICE +42 -0
  166. docs/lakebridge/src/components/Button.tsx +81 -0
  167. docs/lakebridge/src/css/custom.css +167 -0
  168. docs/lakebridge/src/css/table.css +20 -0
  169. docs/lakebridge/src/pages/index.tsx +57 -0
  170. docs/lakebridge/src/theme/Footer/index.tsx +24 -0
  171. docs/lakebridge/src/theme/Layout/index.tsx +18 -0
databricks/labs/lakebridge/reconcile/recon_config.py
@@ -0,0 +1,363 @@
+ from __future__ import annotations
+
+ import logging
+
+ from dataclasses import dataclass
+ from collections.abc import Callable
+
+ from sqlglot import expressions as exp
+
+ from databricks.labs.lakebridge.reconcile.constants import SamplingOptionMethod, SamplingSpecificationsType
+
+ logger = logging.getLogger(__name__)
+
+ _SUPPORTED_AGG_TYPES: set[str] = {
+     "min",
+     "max",
+     "count",
+     "sum",
+     "avg",
+     "mean",
+     "mode",
+     "stddev",
+     "variance",
+     "median",
+ }
+
+ RECONCILE_OPERATION_NAME = "reconcile"
+ AGG_RECONCILE_OPERATION_NAME = "aggregates-reconcile"
+
+
+ class TableThresholdBoundsException(ValueError):
+     """Raise the error when the bounds for table threshold are invalid"""
+
+
+ class InvalidModelForTableThreshold(ValueError):
+     """Raise the error when the model for table threshold is invalid"""
+
+
+ @dataclass
+ class HashAlgoMapping:
+     source: Callable
+     target: Callable
+
+
+ @dataclass
+ class SamplingSpecifications:
+     type: SamplingSpecificationsType
+     value: float
+
+     def __post_init__(self):
+         if not isinstance(self.type, SamplingSpecificationsType):
+             self.type = SamplingSpecificationsType(str(self.type).lower())
+         # Disabled
+         if self.type == SamplingSpecificationsType.FRACTION:
+             raise ValueError("SamplingSpecifications: 'FRACTION' type is disabled")
+         if self.type == SamplingSpecificationsType.FRACTION and (self.value is None or (not 0 < self.value < 1)):
+             raise ValueError("SamplingSpecifications: Fraction value must be greater than 0 and less than 1")
+
+
+ @dataclass
+ class SamplingOptions:
+     method: SamplingOptionMethod
+     specifications: SamplingSpecifications
+     stratified_columns: list[str] | None = None
+     stratified_buckets: int | None = None
+
+     def __post_init__(self):
+         if not isinstance(self.method, SamplingOptionMethod):
+             self.method = SamplingOptionMethod(str(self.method).lower())
+
+         if self.stratified_columns:
+             self.stratified_columns = [col.lower() for col in self.stratified_columns]
+
+         if self.method == SamplingOptionMethod.STRATIFIED:
+             if not self.stratified_columns or not self.stratified_buckets:
+                 raise ValueError(
+                     "SamplingOptions : stratified_columns and stratified_buckets are required for STRATIFIED method"
+                 )
+
+
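A brief usage sketch (editorial, not part of the packaged diff) showing how the __post_init__ hooks coerce raw strings into enums and enforce the stratified-sampling invariants. The COUNT member of SamplingSpecificationsType is an assumption; only FRACTION (disabled above) is visible in this diff:

    # Hypothetical values; "count" assumes a SamplingSpecificationsType.COUNT member exists.
    specs = SamplingSpecifications(type="count", value=1000)  # string coerced to the enum
    opts = SamplingOptions(
        method="stratified",                         # string coerced to SamplingOptionMethod
        specifications=specs,
        stratified_columns=["Region", "Segment"],    # lower-cased to ["region", "segment"]
        stratified_buckets=10,
    )
    # Omitting stratified_columns or stratified_buckets with the STRATIFIED method raises ValueError.
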
+ @dataclass
+ class JdbcReaderOptions:
+     number_partitions: int | None = None
+     partition_column: str | None = None
+     lower_bound: str | None = None
+     upper_bound: str | None = None
+     fetch_size: int = 100
+
+     def __post_init__(self):
+         self.partition_column = self.partition_column.lower() if self.partition_column else None
+
+
+ @dataclass
+ class ColumnMapping:
+     source_name: str
+     target_name: str
+
+     def __post_init__(self):
+         self.source_name = self.source_name.lower()
+         self.target_name = self.target_name.lower()
+
+
+ @dataclass
+ class Transformation:
+     column_name: str
+     source: str | None = None
+     target: str | None = None
+
+     def __post_init__(self):
+         self.column_name = self.column_name.lower()
+
+
+ @dataclass
+ class ColumnThresholds:
+     column_name: str
+     lower_bound: str
+     upper_bound: str
+     type: str
+
+     def __post_init__(self):
+         self.column_name = self.column_name.lower()
+         self.type = self.type.lower()
+
+     def get_mode(self):
+         return "percentage" if "%" in self.lower_bound or "%" in self.upper_bound else "absolute"
+
+     def get_type(self):
+         if any(self.type in numeric_type.value.lower() for numeric_type in exp.DataType.NUMERIC_TYPES):
+             if self.get_mode() == "absolute":
+                 return "number_absolute"
+             return "number_percentage"
+
+         if any(self.type in numeric_type.value.lower() for numeric_type in exp.DataType.TEMPORAL_TYPES):
+             return "datetime"
+         return None
+
+
+ @dataclass
+ class TableThresholds:
+     lower_bound: str
+     upper_bound: str
+     model: str
+
+     def __post_init__(self):
+         self.model = self.model.lower()
+         self.validate_threshold_bounds()
+         self.validate_threshold_model()
+
+     def get_mode(self):
+         return "percentage" if "%" in self.lower_bound or "%" in self.upper_bound else "absolute"
+
+     def validate_threshold_bounds(self):
+         lower_bound = int(self.lower_bound.replace("%", ""))
+         upper_bound = int(self.upper_bound.replace("%", ""))
+         if lower_bound < 0 or upper_bound < 0:
+             raise TableThresholdBoundsException("Threshold bounds for table cannot be negative.")
+         if lower_bound > upper_bound:
+             raise TableThresholdBoundsException("Lower bound cannot be greater than upper bound.")
+
+     def validate_threshold_model(self):
+         if self.model not in ["mismatch"]:
+             raise InvalidModelForTableThreshold(
+                 f"Invalid model for Table Threshold: expected 'mismatch', but got '{self.model}'."
+             )
+
+
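A short editorial example (not in the wheel) of how the bound strings drive threshold behavior; the values are hypothetical:

    ColumnThresholds(column_name="Amount", lower_bound="-5%", upper_bound="5%", type="int").get_mode()  # "percentage"
    ColumnThresholds(column_name="amount", lower_bound="0", upper_bound="100", type="int").get_mode()   # "absolute"
    TableThresholds(lower_bound="5%", upper_bound="3%", model="mismatch")
    # raises TableThresholdBoundsException: lower bound (5) exceeds upper bound (3)
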
+ @dataclass
+ class Filters:
+     source: str | None = None
+     target: str | None = None
+
+
+ def to_lower_case(input_list: list[str]) -> list[str]:
+     return [element.lower() for element in input_list]
+
+
+ @dataclass
+ class Table:
+     source_name: str
+     target_name: str
+     sampling_options: SamplingOptions | None = None
+     aggregates: list[Aggregate] | None = None
+     join_columns: list[str] | None = None
+     jdbc_reader_options: JdbcReaderOptions | None = None
+     select_columns: list[str] | None = None
+     drop_columns: list[str] | None = None
+     column_mapping: list[ColumnMapping] | None = None
+     transformations: list[Transformation] | None = None
+     column_thresholds: list[ColumnThresholds] | None = None
+     filters: Filters | None = None
+     table_thresholds: list[TableThresholds] | None = None
+
+     def __post_init__(self):
+         self.source_name = self.source_name.lower()
+         self.target_name = self.target_name.lower()
+         self.select_columns = to_lower_case(self.select_columns) if self.select_columns else None
+         self.drop_columns = to_lower_case(self.drop_columns) if self.drop_columns else None
+         self.join_columns = to_lower_case(self.join_columns) if self.join_columns else None
+
+     @property
+     def to_src_col_map(self):
+         if self.column_mapping:
+             return {c.source_name: c.target_name for c in self.column_mapping}
+         return None
+
+     @property
+     def to_tgt_col_map(self):
+         if self.column_mapping:
+             return {c.target_name: c.source_name for c in self.column_mapping}
+         return None
+
+     def get_src_to_tgt_col_mapping_list(self, cols: list[str], layer: str) -> set[str]:
+         if layer == "source":
+             return set(cols)
+         if self.to_src_col_map:
+             return {self.to_src_col_map.get(col, col) for col in cols}
+         return set(cols)
+
+     def get_layer_src_to_tgt_col_mapping(self, column_name: str, layer: str) -> str:
+         if layer == "source":
+             return column_name
+         if self.to_src_col_map:
+             return self.to_src_col_map.get(column_name, column_name)
+         return column_name
+
+     def get_tgt_to_src_col_mapping_list(self, cols: list[str] | set[str]) -> set[str]:
+         if self.to_tgt_col_map:
+             return {self.to_tgt_col_map.get(col, col) for col in cols}
+         return set(cols)
+
+     def get_layer_tgt_to_src_col_mapping(self, column_name: str, layer: str) -> str:
+         if layer == "source":
+             return column_name
+         if self.to_tgt_col_map:
+             return self.to_tgt_col_map.get(column_name, column_name)
+         return column_name
+
+     def get_select_columns(self, schema: list[Schema], layer: str) -> set[str]:
+         if self.select_columns is None:
+             return {sch.column_name for sch in schema}
+         if self.to_src_col_map:
+             return self.get_src_to_tgt_col_mapping_list(self.select_columns, layer)
+         return set(self.select_columns)
+
+     def get_threshold_columns(self, layer: str) -> set[str]:
+         if self.column_thresholds is None:
+             return set()
+         return {self.get_layer_src_to_tgt_col_mapping(thresh.column_name, layer) for thresh in self.column_thresholds}
+
+     def get_join_columns(self, layer: str) -> set[str] | None:
+         if self.join_columns is None:
+             return None
+         return {self.get_layer_src_to_tgt_col_mapping(col, layer) for col in self.join_columns}
+
+     def get_drop_columns(self, layer: str) -> set[str]:
+         if self.drop_columns is None:
+             return set()
+         return {self.get_layer_src_to_tgt_col_mapping(col, layer) for col in self.drop_columns}
+
+     def get_transformation_dict(self, layer: str) -> dict[str, str]:
+         if self.transformations:
+             if layer == "source":
+                 return {
+                     trans.column_name: (trans.source if trans.source else trans.column_name)
+                     for trans in self.transformations
+                 }
+             return {
+                 self.get_layer_src_to_tgt_col_mapping(trans.column_name, layer): (
+                     trans.target if trans.target else self.get_layer_src_to_tgt_col_mapping(trans.column_name, layer)
+                 )
+                 for trans in self.transformations
+             }
+         return {}
+
+     def get_partition_column(self, layer: str) -> set[str]:
+         if self.jdbc_reader_options and layer == "source":
+             if self.jdbc_reader_options.partition_column:
+                 return {self.jdbc_reader_options.partition_column}
+         return set()
+
+     def get_filter(self, layer: str) -> str | None:
+         if self.filters is None:
+             return None
+         if layer == "source":
+             return self.filters.source
+         return self.filters.target
+
+
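An editorial sketch (not in the wheel) of how ColumnMapping drives the layer-aware column translation above; the table and column names are hypothetical:

    table = Table(
        source_name="ORDERS",                # lower-cased to "orders" by __post_init__
        target_name="orders_bronze",
        join_columns=["Order_ID"],           # lower-cased to ["order_id"]
        column_mapping=[ColumnMapping(source_name="Order_ID", target_name="order_key")],
    )
    table.get_join_columns("source")  # {"order_id"}
    table.get_join_columns("target")  # {"order_key"}, translated via to_src_col_map
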
+ @dataclass
+ class Schema:
+     column_name: str
+     data_type: str
+
+
+ @dataclass
+ class Aggregate:
+     agg_columns: list[str]
+     type: str
+     group_by_columns: list[str] | None = None
+
+     def __post_init__(self):
+         self.agg_columns = to_lower_case(self.agg_columns)
+         self.type = self.type.lower()
+         self.group_by_columns = to_lower_case(self.group_by_columns) if self.group_by_columns else None
+         assert (
+             self.type in _SUPPORTED_AGG_TYPES
+         ), f"Invalid aggregate type: {self.type}, only {_SUPPORTED_AGG_TYPES} are supported."
+
+     def get_agg_type(self):
+         return self.type
+
+     @classmethod
+     def _join_columns(cls, columns: list[str]):
+         return "+__+".join(columns)
+
+     @property
+     def group_by_columns_as_str(self):
+         return self._join_columns(self.group_by_columns) if self.group_by_columns else "NA"
+
+     @property
+     def agg_columns_as_str(self):
+         return self._join_columns(self.agg_columns)
+
+
+ @dataclass
+ class AggregateRule:
+     agg_type: str
+     agg_column: str
+     group_by_columns: list[str] | None
+     group_by_columns_as_str: str
+     rule_type: str = "AGGREGATE"
+
+     @property
+     def column_from_rule(self):
+         # creates the rule column, e.g., min_col1_grp1_grp2
+         return f"{self.agg_type}_{self.agg_column}_{self.group_by_columns_as_str}"
+
+     @property
+     def group_by_columns_as_table_column(self):
+         # If group_by_columns are not defined, store it as NULL
+         group_by_cols_as_table_col = "NULL"
+         if self.group_by_columns:
+             # Sort the columns, convert to lower case, and build a comma-separated string, e.g., "grp1, grp2"
+             formatted_cols = ", ".join([f"{col.lower()}" for col in sorted(self.group_by_columns)])
+             group_by_cols_as_table_col = f"\"{formatted_cols}\""
+         return group_by_cols_as_table_col
+
+     def get_rule_query(self, rule_id):
+         rule_info = f""" map( 'agg_type', '{self.agg_type}',
+                      'agg_column', '{self.agg_column}',
+                      'group_by_columns', {self.group_by_columns_as_table_column}
+                      )
+                  """
+         return f" SELECT {rule_id} as rule_id, " f" '{self.rule_type}' as rule_type, " f" {rule_info} as rule_info "
+
+
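An editorial illustration (not in the wheel) of the SQL fragment get_rule_query assembles; the rule values are hypothetical:

    rule = AggregateRule(
        agg_type="min",
        agg_column="col1",
        group_by_columns=["grp2", "grp1"],
        group_by_columns_as_str="grp1+__+grp2",
    )
    rule.column_from_rule    # "min_col1_grp1+__+grp2"
    rule.get_rule_query(101)
    # SELECT 101 as rule_id, 'AGGREGATE' as rule_type, map( 'agg_type', 'min',
    #   'agg_column', 'col1', 'group_by_columns', "grp1, grp2" ) as rule_info
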
+ @dataclass
+ class AggregateQueryRules:
+     layer: str
+     group_by_columns: list[str] | None
+     group_by_columns_as_str: str
+     query: str
+     rules: list[AggregateRule]
databricks/labs/lakebridge/reconcile/recon_output_config.py
@@ -0,0 +1,85 @@
+ from dataclasses import dataclass, field
+
+ from pyspark.sql import DataFrame
+
+ from databricks.labs.lakebridge.reconcile.recon_config import AggregateRule
+
+
+ @dataclass
+ class MismatchOutput:
+     mismatch_df: DataFrame | None = None
+     mismatch_columns: list[str] | None = None
+
+
+ @dataclass
+ class ThresholdOutput:
+     threshold_df: DataFrame | None = None
+     threshold_mismatch_count: int = 0
+
+
+ @dataclass
+ class DataReconcileOutput:
+     mismatch_count: int = 0
+     missing_in_src_count: int = 0
+     missing_in_tgt_count: int = 0
+     mismatch: MismatchOutput = field(default_factory=MismatchOutput)
+     missing_in_src: DataFrame | None = None
+     missing_in_tgt: DataFrame | None = None
+     threshold_output: ThresholdOutput = field(default_factory=ThresholdOutput)
+     exception: str | None = None
+
+
+ @dataclass
+ class SchemaMatchResult:
+     source_column: str
+     source_datatype: str
+     databricks_column: str
+     databricks_datatype: str
+     is_valid: bool = True
+
+
+ @dataclass
+ class SchemaReconcileOutput:
+     is_valid: bool
+     compare_df: DataFrame | None = None
+     exception: str | None = None
+
+
+ @dataclass
+ class ReconcileProcessDuration:
+     start_ts: str
+     end_ts: str | None
+
+
+ @dataclass
+ class StatusOutput:
+     row: bool | None = None
+     column: bool | None = None
+     schema: bool | None = None
+     aggregate: bool | None = None
+
+
+ @dataclass
+ class ReconcileTableOutput:
+     target_table_name: str
+     source_table_name: str
+     status: StatusOutput = field(default_factory=StatusOutput)
+     exception_message: str | None = None
+
+
+ @dataclass
+ class ReconcileOutput:
+     recon_id: str
+     results: list[ReconcileTableOutput]
+
+
+ @dataclass
+ class ReconcileRecordCount:
+     source: int = 0
+     target: int = 0
+
+
+ @dataclass
+ class AggregateQueryOutput:
+     rule: AggregateRule | None
+     reconcile_output: DataReconcileOutput
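For orientation (editorial, not in the wheel), the output containers nest per table under a single recon run; the field values here are hypothetical:

    output = ReconcileOutput(
        recon_id="a1b2c3",
        results=[
            ReconcileTableOutput(
                target_table_name="catalog.schema.orders",
                source_table_name="orders",
                status=StatusOutput(row=True, column=False, schema=True),
            )
        ],
    )
    # Collect tables whose column-level reconciliation failed:
    failed = [r.target_table_name for r in output.results if r.status.column is False]
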
databricks/labs/lakebridge/reconcile/runner.py
@@ -0,0 +1,97 @@
+ import logging
+ import webbrowser
+
+ from databricks.labs.blueprint.installation import Installation
+ from databricks.labs.blueprint.installation import SerdeError
+ from databricks.labs.blueprint.installer import InstallState
+ from databricks.labs.blueprint.tui import Prompts
+ from databricks.sdk import WorkspaceClient
+ from databricks.sdk.errors import NotFound, PermissionDenied
+
+ from databricks.labs.lakebridge.config import ReconcileConfig, TableRecon
+ from databricks.labs.lakebridge.deployment.recon import RECON_JOB_NAME
+ from databricks.labs.lakebridge.reconcile.recon_config import RECONCILE_OPERATION_NAME
+
+ logger = logging.getLogger(__name__)
+
+ _RECON_README_URL = "https://github.com/databrickslabs/lakebridge/blob/main/docs/recon_configurations/README.md"
+
+
+ class ReconcileRunner:
+     def __init__(
+         self,
+         ws: WorkspaceClient,
+         installation: Installation,
+         install_state: InstallState,
+         prompts: Prompts,
+     ):
+         self._ws = ws
+         self._installation = installation
+         self._install_state = install_state
+         self._prompts = prompts
+
+     def run(self, operation_name=RECONCILE_OPERATION_NAME):
+         reconcile_config = self._get_verified_recon_config()
+         job_id = self._get_recon_job_id(reconcile_config)
+         logger.info(f"Triggering the reconcile job with job_id: `{job_id}`")
+         wait = self._ws.jobs.run_now(job_id, job_parameters={"operation_name": operation_name})
+         if not wait.run_id:
+             raise SystemExit(f"Job {job_id} execution failed. Please check the job logs for more details.")
+
+         job_run_url = f"{self._ws.config.host}/jobs/{job_id}/runs/{wait.run_id}"
+         logger.info(
+             f"'{operation_name.upper()}' job started. Please check the job_url `{job_run_url}` for the current status."
+         )
+         if self._prompts.confirm(f"Would you like to open the job run URL `{job_run_url}` in the browser?"):
+             webbrowser.open(job_run_url)
+
+     def _get_verified_recon_config(self) -> ReconcileConfig:
+         try:
+             recon_config = self._installation.load(ReconcileConfig)
+         except NotFound as err:
+             raise SystemExit("Cannot find existing `reconcile` installation. Please try reinstalling.") from err
+         except (PermissionDenied, SerdeError, ValueError, AttributeError) as e:
+             install_dir = self._installation.install_folder()
+             raise SystemExit(
+                 f"Existing `reconcile` installation at {install_dir} is corrupted. Please try reinstalling."
+             ) from e
+
+         self._verify_recon_table_config(recon_config)
+         return recon_config
+
+     def _verify_recon_table_config(self, recon_config):
+         source_catalog_or_schema = (
+             recon_config.database_config.source_catalog
+             if recon_config.database_config.source_catalog
+             else recon_config.database_config.source_schema
+         )
+         # Filename pattern for recon table config `recon_config_<SOURCE>_<CATALOG_OR_SCHEMA>_<REPORT_TYPE>.json`
+         # Example: recon_config_snowflake_sample_data_all.json
+         filename = f"recon_config_{recon_config.data_source}_{source_catalog_or_schema}_{recon_config.report_type}.json"
+         try:
+             logger.debug(f"Loading recon table config `{filename}` from workspace.")
+             self._installation.load(TableRecon, filename=filename)
+         except NotFound as e:
+             err_msg = (
+                 "Cannot find recon table configuration in existing `reconcile` installation. "
+                 f"Please provide the configuration file {filename} in the workspace."
+             )
+             logger.error(f"{err_msg}. For more details, please refer to {_RECON_README_URL}")
+             raise SystemExit(err_msg) from e
+         except (PermissionDenied, SerdeError, ValueError, AttributeError) as e:
+             install_dir = self._installation.install_folder()
+             err_msg = (
+                 f"Cannot load corrupted recon table configuration from {install_dir}/{filename}. "
+                 f"Please validate the file."
+             )
+             logger.error(f"{err_msg}. For more details, please refer to {_RECON_README_URL}")
+             raise SystemExit(err_msg) from e
+
+     def _get_recon_job_id(self, reconcile_config: ReconcileConfig) -> int:
+         if reconcile_config.job_id:
+             logger.debug("Reconcile job id found in the reconcile config.")
+             return int(reconcile_config.job_id)
+         if RECON_JOB_NAME in self._install_state.jobs:
+             logger.debug("Reconcile job id found in the install state.")
+             return int(self._install_state.jobs[RECON_JOB_NAME])
+         raise SystemExit("Reconcile Job ID not found. Please try reinstalling.")
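
A hedged sketch (editorial, not in the wheel) of wiring up ReconcileRunner. The construction via Installation.assume_user_home and InstallState.from_installation follows databricks-labs-blueprint conventions; the actual wiring in cli.py may differ:

    from databricks.labs.blueprint.installation import Installation
    from databricks.labs.blueprint.installer import InstallState
    from databricks.labs.blueprint.tui import Prompts
    from databricks.sdk import WorkspaceClient

    ws = WorkspaceClient()
    installation = Installation.assume_user_home(ws, "lakebridge")
    install_state = InstallState.from_installation(installation)
    runner = ReconcileRunner(ws, installation, install_state, Prompts())
    runner.run()  # defaults to RECONCILE_OPERATION_NAME, i.e. "reconcile"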