databricks-labs-lakebridge 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (171)
  1. databricks/__init__.py +3 -0
  2. databricks/labs/__init__.py +3 -0
  3. databricks/labs/lakebridge/__about__.py +2 -0
  4. databricks/labs/lakebridge/__init__.py +11 -0
  5. databricks/labs/lakebridge/assessments/configure_assessment.py +194 -0
  6. databricks/labs/lakebridge/assessments/pipeline.py +188 -0
  7. databricks/labs/lakebridge/assessments/profiler_config.py +30 -0
  8. databricks/labs/lakebridge/base_install.py +12 -0
  9. databricks/labs/lakebridge/cli.py +449 -0
  10. databricks/labs/lakebridge/config.py +192 -0
  11. databricks/labs/lakebridge/connections/__init__.py +0 -0
  12. databricks/labs/lakebridge/connections/credential_manager.py +89 -0
  13. databricks/labs/lakebridge/connections/database_manager.py +98 -0
  14. databricks/labs/lakebridge/connections/env_getter.py +13 -0
  15. databricks/labs/lakebridge/contexts/__init__.py +0 -0
  16. databricks/labs/lakebridge/contexts/application.py +133 -0
  17. databricks/labs/lakebridge/coverage/__init__.py +0 -0
  18. databricks/labs/lakebridge/coverage/commons.py +223 -0
  19. databricks/labs/lakebridge/coverage/lakebridge_snow_transpilation_coverage.py +29 -0
  20. databricks/labs/lakebridge/coverage/local_report.py +9 -0
  21. databricks/labs/lakebridge/coverage/sqlglot_snow_transpilation_coverage.py +5 -0
  22. databricks/labs/lakebridge/coverage/sqlglot_tsql_transpilation_coverage.py +5 -0
  23. databricks/labs/lakebridge/deployment/__init__.py +0 -0
  24. databricks/labs/lakebridge/deployment/configurator.py +199 -0
  25. databricks/labs/lakebridge/deployment/dashboard.py +140 -0
  26. databricks/labs/lakebridge/deployment/installation.py +125 -0
  27. databricks/labs/lakebridge/deployment/job.py +147 -0
  28. databricks/labs/lakebridge/deployment/recon.py +145 -0
  29. databricks/labs/lakebridge/deployment/table.py +30 -0
  30. databricks/labs/lakebridge/deployment/upgrade_common.py +124 -0
  31. databricks/labs/lakebridge/discovery/table.py +36 -0
  32. databricks/labs/lakebridge/discovery/table_definition.py +23 -0
  33. databricks/labs/lakebridge/discovery/tsql_table_definition.py +185 -0
  34. databricks/labs/lakebridge/errors/exceptions.py +1 -0
  35. databricks/labs/lakebridge/helpers/__init__.py +0 -0
  36. databricks/labs/lakebridge/helpers/db_sql.py +24 -0
  37. databricks/labs/lakebridge/helpers/execution_time.py +20 -0
  38. databricks/labs/lakebridge/helpers/file_utils.py +64 -0
  39. databricks/labs/lakebridge/helpers/metastore.py +164 -0
  40. databricks/labs/lakebridge/helpers/recon_config_utils.py +176 -0
  41. databricks/labs/lakebridge/helpers/string_utils.py +62 -0
  42. databricks/labs/lakebridge/helpers/telemetry_utils.py +13 -0
  43. databricks/labs/lakebridge/helpers/validation.py +101 -0
  44. databricks/labs/lakebridge/install.py +849 -0
  45. databricks/labs/lakebridge/intermediate/__init__.py +0 -0
  46. databricks/labs/lakebridge/intermediate/dag.py +88 -0
  47. databricks/labs/lakebridge/intermediate/engine_adapter.py +0 -0
  48. databricks/labs/lakebridge/intermediate/root_tables.py +44 -0
  49. databricks/labs/lakebridge/jvmproxy.py +56 -0
  50. databricks/labs/lakebridge/lineage.py +42 -0
  51. databricks/labs/lakebridge/reconcile/__init__.py +0 -0
  52. databricks/labs/lakebridge/reconcile/compare.py +414 -0
  53. databricks/labs/lakebridge/reconcile/connectors/__init__.py +0 -0
  54. databricks/labs/lakebridge/reconcile/connectors/data_source.py +72 -0
  55. databricks/labs/lakebridge/reconcile/connectors/databricks.py +87 -0
  56. databricks/labs/lakebridge/reconcile/connectors/jdbc_reader.py +41 -0
  57. databricks/labs/lakebridge/reconcile/connectors/oracle.py +108 -0
  58. databricks/labs/lakebridge/reconcile/connectors/secrets.py +30 -0
  59. databricks/labs/lakebridge/reconcile/connectors/snowflake.py +173 -0
  60. databricks/labs/lakebridge/reconcile/connectors/source_adapter.py +30 -0
  61. databricks/labs/lakebridge/reconcile/connectors/sql_server.py +132 -0
  62. databricks/labs/lakebridge/reconcile/constants.py +37 -0
  63. databricks/labs/lakebridge/reconcile/exception.py +42 -0
  64. databricks/labs/lakebridge/reconcile/execute.py +920 -0
  65. databricks/labs/lakebridge/reconcile/query_builder/__init__.py +0 -0
  66. databricks/labs/lakebridge/reconcile/query_builder/aggregate_query.py +293 -0
  67. databricks/labs/lakebridge/reconcile/query_builder/base.py +138 -0
  68. databricks/labs/lakebridge/reconcile/query_builder/count_query.py +33 -0
  69. databricks/labs/lakebridge/reconcile/query_builder/expression_generator.py +292 -0
  70. databricks/labs/lakebridge/reconcile/query_builder/hash_query.py +91 -0
  71. databricks/labs/lakebridge/reconcile/query_builder/sampling_query.py +123 -0
  72. databricks/labs/lakebridge/reconcile/query_builder/threshold_query.py +231 -0
  73. databricks/labs/lakebridge/reconcile/recon_capture.py +635 -0
  74. databricks/labs/lakebridge/reconcile/recon_config.py +363 -0
  75. databricks/labs/lakebridge/reconcile/recon_output_config.py +85 -0
  76. databricks/labs/lakebridge/reconcile/runner.py +97 -0
  77. databricks/labs/lakebridge/reconcile/sampler.py +239 -0
  78. databricks/labs/lakebridge/reconcile/schema_compare.py +126 -0
  79. databricks/labs/lakebridge/resources/__init__.py +0 -0
  80. databricks/labs/lakebridge/resources/config/credentials.yml +33 -0
  81. databricks/labs/lakebridge/resources/reconcile/__init__.py +0 -0
  82. databricks/labs/lakebridge/resources/reconcile/dashboards/__init__.py +0 -0
  83. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/00_0_aggregate_recon_header.md +6 -0
  84. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_0_recon_id.filter.yml +6 -0
  85. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_1_executed_by.filter.yml +5 -0
  86. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_2_started_at.filter.yml +5 -0
  87. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_0_source_type.filter.yml +5 -0
  88. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_1_source_table.filter.yml +5 -0
  89. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_2_target_table.filter.yml +5 -0
  90. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/04_0_aggregate_summary_table.sql +46 -0
  91. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/05_0_aggregate_recon_drilldown_header.md +2 -0
  92. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_0_recon_id.filter.yml +5 -0
  93. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_1_category.filter.yml +5 -0
  94. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_2_aggregate_type.filter.yml +5 -0
  95. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/07_0_target_table.filter.yml +4 -0
  96. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/07_1_source_table.filter.yml +4 -0
  97. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/08_0_aggregate_details_table.sql +92 -0
  98. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/09_0_aggregate_missing_mismatch_header.md +1 -0
  99. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/10_0_aggr_mismatched_records.sql +19 -0
  100. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/11_0_aggr_missing_in_databricks.sql +19 -0
  101. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/11_1_aggr_missing_in_source.sql +19 -0
  102. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/dashboard.yml +365 -0
  103. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/00_0_recon_main.md +3 -0
  104. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/01_0_recon_id.filter.yml +6 -0
  105. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/01_1_report_type.filter.yml +5 -0
  106. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/01_2_executed_by.filter.yml +5 -0
  107. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/02_0_source_type.filter.yml +5 -0
  108. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/02_1_source_table.filter.yml +6 -0
  109. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/02_2_target_table.filter.yml +6 -0
  110. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/03_0_started_at.filter.yml +5 -0
  111. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/05_0_summary_table.sql +38 -0
  112. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/06_0_schema_comparison_header.md +3 -0
  113. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/07_0_schema_details_table.sql +42 -0
  114. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/08_0_drill_down_header.md +3 -0
  115. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/09_0_recon_id.filter.yml +4 -0
  116. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/09_1_category.filter.yml +4 -0
  117. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/10_0_target_table.filter.yml +4 -0
  118. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/10_1_source_table.filter.yml +4 -0
  119. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/11_0_recon_details_pivot.sql +40 -0
  120. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/12_0_daily_data_validation_issue_header.md +3 -0
  121. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/13_0_success_fail_.filter.yml +4 -0
  122. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/14_0_failed_recon_ids.sql +15 -0
  123. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/15_0_total_failed_runs.sql +10 -0
  124. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/15_1_failed_targets.sql +10 -0
  125. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/15_2_successful_targets.sql +10 -0
  126. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/16_0_missing_mismatch_header.md +1 -0
  127. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/17_0_mismatched_records.sql +14 -0
  128. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/17_1_threshold_mismatches.sql +14 -0
  129. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/18_0_missing_in_databricks.sql +14 -0
  130. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/18_1_missing_in_source.sql +14 -0
  131. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/dashboard.yml +545 -0
  132. databricks/labs/lakebridge/resources/reconcile/queries/__init__.py +0 -0
  133. databricks/labs/lakebridge/resources/reconcile/queries/installation/__init__.py +0 -0
  134. databricks/labs/lakebridge/resources/reconcile/queries/installation/aggregate_details.sql +7 -0
  135. databricks/labs/lakebridge/resources/reconcile/queries/installation/aggregate_metrics.sql +15 -0
  136. databricks/labs/lakebridge/resources/reconcile/queries/installation/aggregate_rules.sql +6 -0
  137. databricks/labs/lakebridge/resources/reconcile/queries/installation/details.sql +7 -0
  138. databricks/labs/lakebridge/resources/reconcile/queries/installation/main.sql +24 -0
  139. databricks/labs/lakebridge/resources/reconcile/queries/installation/metrics.sql +21 -0
  140. databricks/labs/lakebridge/transpiler/__init__.py +0 -0
  141. databricks/labs/lakebridge/transpiler/execute.py +423 -0
  142. databricks/labs/lakebridge/transpiler/lsp/__init__.py +0 -0
  143. databricks/labs/lakebridge/transpiler/lsp/lsp_engine.py +564 -0
  144. databricks/labs/lakebridge/transpiler/sqlglot/__init__.py +0 -0
  145. databricks/labs/lakebridge/transpiler/sqlglot/dialect_utils.py +30 -0
  146. databricks/labs/lakebridge/transpiler/sqlglot/generator/__init__.py +0 -0
  147. databricks/labs/lakebridge/transpiler/sqlglot/generator/databricks.py +771 -0
  148. databricks/labs/lakebridge/transpiler/sqlglot/lca_utils.py +138 -0
  149. databricks/labs/lakebridge/transpiler/sqlglot/local_expression.py +197 -0
  150. databricks/labs/lakebridge/transpiler/sqlglot/parsers/__init__.py +0 -0
  151. databricks/labs/lakebridge/transpiler/sqlglot/parsers/oracle.py +23 -0
  152. databricks/labs/lakebridge/transpiler/sqlglot/parsers/presto.py +202 -0
  153. databricks/labs/lakebridge/transpiler/sqlglot/parsers/snowflake.py +535 -0
  154. databricks/labs/lakebridge/transpiler/sqlglot/sqlglot_engine.py +203 -0
  155. databricks/labs/lakebridge/transpiler/transpile_engine.py +49 -0
  156. databricks/labs/lakebridge/transpiler/transpile_status.py +68 -0
  157. databricks/labs/lakebridge/uninstall.py +28 -0
  158. databricks/labs/lakebridge/upgrades/v0.4.0_add_main_table_operation_name_column.py +80 -0
  159. databricks/labs/lakebridge/upgrades/v0.6.0_alter_metrics_datatype.py +51 -0
  160. databricks_labs_lakebridge-0.10.0.dist-info/METADATA +58 -0
  161. databricks_labs_lakebridge-0.10.0.dist-info/RECORD +171 -0
  162. databricks_labs_lakebridge-0.10.0.dist-info/WHEEL +4 -0
  163. databricks_labs_lakebridge-0.10.0.dist-info/entry_points.txt +2 -0
  164. databricks_labs_lakebridge-0.10.0.dist-info/licenses/LICENSE +69 -0
  165. databricks_labs_lakebridge-0.10.0.dist-info/licenses/NOTICE +42 -0
  166. docs/lakebridge/src/components/Button.tsx +81 -0
  167. docs/lakebridge/src/css/custom.css +167 -0
  168. docs/lakebridge/src/css/table.css +20 -0
  169. docs/lakebridge/src/pages/index.tsx +57 -0
  170. docs/lakebridge/src/theme/Footer/index.tsx +24 -0
  171. docs/lakebridge/src/theme/Layout/index.tsx +18 -0
databricks/labs/lakebridge/reconcile/sampler.py
@@ -0,0 +1,239 @@
+ import logging
+ from abc import ABC, abstractmethod
+
+ from pyspark.sql import DataFrame
+ from pyspark.sql import functions as F
+
+ from databricks.labs.lakebridge.reconcile.constants import SamplingOptionMethod, SamplingSpecificationsType
+ from databricks.labs.lakebridge.reconcile.recon_config import SamplingOptions, SamplingSpecifications
+
+ logger = logging.getLogger(__name__)
+
+ _MIN_SAMPLE_COUNT = 50
+ _MAX_SAMPLE_COUNT = 400
+
+ _MIN_BUCKET_LIMIT = 2
+ _MAX_BUCKET_LIMIT = 50
+
+
+ class Sampler(ABC):
+     def __init__(self, sampling_options: SamplingOptions):
+         self._sampling_options = sampling_options
+
+     @abstractmethod
+     def _validate_sampling_options(self):
+         return NotImplemented
+
+     @abstractmethod
+     def sample(
+         self,
+         keys_df: DataFrame,
+         keys_df_count: int,
+         key_columns: list[str],
+         target_table: DataFrame,
+     ) -> DataFrame:
+         return NotImplemented
+
+
+ class RandomSampler(Sampler):
+     """
+     RandomSampler is a subclass of Sampler that performs random sampling on a given DataFrame.
+     """
+
+     def __init__(self, sampling_options: SamplingOptions, seed: int = 100):
+         """
+         Attributes:
+             sampling_options (SamplingOptions): The options for sampling, including method and specifications.
+             seed (int): The seed for random sampling to ensure reproducibility.
+         """
+         super().__init__(sampling_options)
+         self.seed = seed
+
+     def _validate_sampling_options(self):
+         if self._sampling_options.method != SamplingOptionMethod.RANDOM:
+             raise ValueError("RandomSampler: Only 'random' sampling method is supported")
+
+         specs = self._sampling_options.specifications
+         if specs.type == SamplingSpecificationsType.COUNT and (
+             specs.value is None or (specs.value < _MIN_SAMPLE_COUNT)
+         ):
+             logger.info(
+                 f"RandomSampler: Sample count must be >= {_MIN_SAMPLE_COUNT}, " f"flooring to {_MIN_SAMPLE_COUNT}"
+             )
+             self._sampling_options.specifications.value = _MIN_SAMPLE_COUNT
+
+         elif specs.type == SamplingSpecificationsType.COUNT and specs.value > _MAX_SAMPLE_COUNT:
+             logger.info(
+                 f"RandomSampler: Sample count must be <= {_MAX_SAMPLE_COUNT}, " f"capping to {_MAX_SAMPLE_COUNT}"
+             )
+             self._sampling_options.specifications.value = _MAX_SAMPLE_COUNT
+
+     def sample(
+         self, keys_df: DataFrame, keys_df_count: int, key_columns: list[str], target_table: DataFrame
+     ) -> DataFrame:
+         """
+         Performs random sampling on the given DataFrame based on the specified options.
+         - Validates the sampling options.
+         - Uses the pre-calculated `keys_df_count` from `reconcile_output.mismatch_count` to avoid recomputing `keys_df`.
+         - If the specifications type is FRACTION, samples the DataFrame based on the fraction value.
+         - If the specifications type is COUNT, calculates the fraction and samples the DataFrame accordingly,
+           then limits the sample size to the specified count.
+         - Returns the sampled DataFrame.
+         """
+
+         self._validate_sampling_options()
+         specs = self._sampling_options.specifications
+
+         default_sampled_df = keys_df.limit(_MIN_SAMPLE_COUNT)
+
+         if specs.type == SamplingSpecificationsType.FRACTION:
+             sampled_df = keys_df.sample(fraction=specs.value, seed=self.seed)
+         elif specs.type == SamplingSpecificationsType.COUNT:
+             total_count = keys_df_count
+             sample_size = int(specs.value)
+             fraction = min(1.0, sample_size / total_count)
+             sampled_df = keys_df.sample(fraction=fraction, seed=self.seed).limit(sample_size)
+         else:
+             return default_sampled_df
+
+         return sampled_df
+
+
+ class StratifiedSampler(Sampler):
+     """
+     StratifiedSampler is a subclass of Sampler that performs stratified sampling on a given DataFrame.
+     """
+
+     def __init__(self, sampling_options: SamplingOptions, seed: int = 100):
+         """
+         Attributes:
+             sampling_options (SamplingOptions): The options for sampling, including method, specifications,
+                 stratified columns, and stratified buckets.
+             seed (int): The seed for random sampling to ensure reproducibility.
+         """
+         super().__init__(sampling_options)
+         self.seed = seed
+
+     def _validate_sampling_options(self):
+         if self._sampling_options.method != SamplingOptionMethod.STRATIFIED:
+             raise ValueError("StratifiedSampler: Only 'stratified' sampling method is supported")
+
+         specs = self._sampling_options.specifications
+         stratified_buckets = self._sampling_options.stratified_buckets
+
+         if specs.type == SamplingSpecificationsType.COUNT and (
+             specs.value is None or (specs.value < _MIN_SAMPLE_COUNT)
+         ):
+             logger.info(
+                 f"StratifiedSampler: Sample count must be >= {_MIN_SAMPLE_COUNT}, " f"flooring to {_MIN_SAMPLE_COUNT}"
+             )
+             self._sampling_options.specifications.value = _MIN_SAMPLE_COUNT
+
+         elif specs.type == SamplingSpecificationsType.COUNT and specs.value > _MAX_SAMPLE_COUNT:
+             logger.info(
+                 f"StratifiedSampler: Sample count must be <= {_MAX_SAMPLE_COUNT}, " f"capping to {_MAX_SAMPLE_COUNT}"
+             )
+             self._sampling_options.specifications.value = _MAX_SAMPLE_COUNT
+
+         if stratified_buckets < _MIN_BUCKET_LIMIT:
+             logger.info(
+                 f"StratifiedSampler: Stratified buckets must be >= {_MIN_BUCKET_LIMIT}, "
+                 f"flooring to {_MIN_BUCKET_LIMIT}"
+             )
+             self._sampling_options.stratified_buckets = _MIN_BUCKET_LIMIT
+         elif stratified_buckets > _MAX_BUCKET_LIMIT:
+             logger.info(
+                 f"StratifiedSampler: Stratified buckets must be <= {_MAX_BUCKET_LIMIT}, "
+                 f"capping to {_MAX_BUCKET_LIMIT}"
+             )
+             self._sampling_options.stratified_buckets = _MAX_BUCKET_LIMIT
+
+     def sample(
+         self, keys_df: DataFrame, keys_df_count: int, key_columns: list[str], target_table: DataFrame
+     ) -> DataFrame:
+         """
+         Performs stratified sampling on the given DataFrame based on the specified options.
+         - Joins the keys_df with the target_table on the key_columns.
+         - Creates a hash bucket column based on the stratified columns.
+         - Adds the bucket column to the joined DataFrame.
+         - If the specifications type is FRACTION, calculates fractions for each bucket and samples accordingly.
+         - If the specifications type is COUNT, calculates fractions for each bucket and samples accordingly,
+           then limits the sample size to the specified count.
+         - Returns the sampled DataFrame.
+         """
+         self._validate_sampling_options()
+
+         specs = self._sampling_options.specifications
+         stratified_columns = self._sampling_options.stratified_columns
+         non_key_stratified_columns = [
+             col for col in (self._sampling_options.stratified_columns or []) if col not in key_columns
+         ]
+         stratified_buckets = self._sampling_options.stratified_buckets or _MIN_BUCKET_LIMIT
+
+         keys_df.select(*key_columns)
+         default_sampled_df = keys_df.limit(_MIN_SAMPLE_COUNT)
+
+         # Join the mismatched_df with target_table_df
+         joined_df = keys_df.join(
+             target_table, [keys_df[col] == target_table[col] for col in key_columns], "inner"
+         ).select(*[keys_df[col] for col in key_columns], *[target_table[col] for col in non_key_stratified_columns])
+
+         # Create a hash bucket column based on the stratified columns
+         bucket_col = F.pmod(F.abs(F.hash(*[F.col(c) for c in (stratified_columns or [])])), stratified_buckets)
+
+         # Add the bucket column to the joined_df
+         bucketed_df = joined_df.withColumn("bucket", bucket_col)
+
+         if specs.type == SamplingSpecificationsType.FRACTION:
+             # Calculate fractions for each bucket
+             unique_values = bucketed_df.select("bucket").distinct().collect()
+             fractions = {row["bucket"]: specs.value for row in unique_values}
+             sampled_df = bucketed_df.sampleBy("bucket", fractions=fractions, seed=self.seed)
+
+         elif specs.type == SamplingSpecificationsType.COUNT:
+             # Calculate fractions for each bucket
+             sample_size = int(specs.value)
+             bucket_counts = bucketed_df.groupBy("bucket").count().collect()
+             total_count = sum(row['count'] for row in bucket_counts)
+             fractions = {
+                 row["bucket"]: min(1.0, (sample_size * row['count'] / total_count) / row['count'])
+                 for row in bucket_counts
+             }
+             sampled_df = bucketed_df.sampleBy("bucket", fractions=fractions, seed=self.seed).limit(sample_size)
+
+         else:
+             return default_sampled_df.select(*key_columns)
+
+         return sampled_df.select(*key_columns)
+
+
+ # TODO: Move away from SamplerFactory to a context-driven approach
+ class SamplerFactory:
+     @staticmethod
+     def get_sampler(sampling_options: SamplingOptions, seed: int = 100) -> Sampler:
+         # If no sampling options provided, use default
+         if sampling_options is None:
+             default_sampling_options = SamplingOptions(
+                 method=SamplingOptionMethod.RANDOM,
+                 specifications=SamplingSpecifications(type=SamplingSpecificationsType.COUNT, value=_MIN_SAMPLE_COUNT),
+                 stratified_columns=None,
+                 stratified_buckets=None,
+             )
+             logger.info(
+                 f"SamplerFactory: No sampling options provided, using default options: " f"{default_sampling_options}"
+             )
+             sampling_options = default_sampling_options
+
+         else:
+             logger.info(f"SamplerFactory: Creating sampler using provided options: " f"{sampling_options}")
+
+         # Use a dictionary-based dispatch for better extensibility
+         sampler_map = {SamplingOptionMethod.RANDOM: RandomSampler, SamplingOptionMethod.STRATIFIED: StratifiedSampler}
+
+         # Get the sampler class
+         sampler_class = sampler_map.get(sampling_options.method)
+
+         if sampler_class is None:
+             raise ValueError(f"SamplerFactory : Unsupported sampling method: {sampling_options.method}")
+
+         return sampler_class(sampling_options, seed)
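
As an aside (not part of the packaged file), a minimal usage sketch of the samplers above: it assumes the `SamplingOptions` and `SamplingSpecifications` dataclasses from `recon_config.py` (not shown in this diff) accept the keyword arguments used in `SamplerFactory.get_sampler`'s default, and the table and column data are made up for illustration.

from pyspark.sql import SparkSession

from databricks.labs.lakebridge.reconcile.constants import SamplingOptionMethod, SamplingSpecificationsType
from databricks.labs.lakebridge.reconcile.recon_config import SamplingOptions, SamplingSpecifications
from databricks.labs.lakebridge.reconcile.sampler import SamplerFactory

spark = SparkSession.builder.getOrCreate()

# Hypothetical inputs: keys of mismatched rows and the corresponding target table.
keys_df = spark.createDataFrame([(i,) for i in range(1_000)], ["order_id"])
target_df = spark.createDataFrame([(i, i % 7) for i in range(1_000)], ["order_id", "region"])

# Ask for 100 randomly sampled keys (within the 50..400 clamp enforced above).
options = SamplingOptions(
    method=SamplingOptionMethod.RANDOM,
    specifications=SamplingSpecifications(type=SamplingSpecificationsType.COUNT, value=100),
    stratified_columns=None,
    stratified_buckets=None,
)

sampler = SamplerFactory.get_sampler(options, seed=42)
sampled_keys = sampler.sample(keys_df, keys_df.count(), ["order_id"], target_df)
sampled_keys.show()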
databricks/labs/lakebridge/reconcile/schema_compare.py
@@ -0,0 +1,126 @@
+ import logging
+ from dataclasses import asdict
+
+ from pyspark.sql import DataFrame, SparkSession
+ from pyspark.sql.types import BooleanType, StringType, StructField, StructType
+ from sqlglot import Dialect, parse_one
+
+ from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import get_dialect
+ from databricks.labs.lakebridge.reconcile.recon_config import Schema, Table
+ from databricks.labs.lakebridge.reconcile.recon_output_config import SchemaMatchResult, SchemaReconcileOutput
+ from databricks.labs.lakebridge.transpiler.sqlglot.generator.databricks import Databricks
+
+ logger = logging.getLogger(__name__)
+
+
+ class SchemaCompare:
+     def __init__(
+         self,
+         spark: SparkSession,
+     ):
+         self.spark = spark
+
+     # Define the schema for the schema compare DataFrame
+     _schema_compare_schema: StructType = StructType(
+         [
+             StructField("source_column", StringType(), False),
+             StructField("source_datatype", StringType(), False),
+             StructField("databricks_column", StringType(), True),
+             StructField("databricks_datatype", StringType(), True),
+             StructField("is_valid", BooleanType(), False),
+         ]
+     )
+
+     @classmethod
+     def _build_master_schema(
+         cls,
+         source_schema: list[Schema],
+         databricks_schema: list[Schema],
+         table_conf: Table,
+     ) -> list[SchemaMatchResult]:
+         master_schema = source_schema
+         if table_conf.select_columns:
+             master_schema = [schema for schema in master_schema if schema.column_name in table_conf.select_columns]
+         if table_conf.drop_columns:
+             master_schema = [sschema for sschema in master_schema if sschema.column_name not in table_conf.drop_columns]
+
+         target_column_map = table_conf.to_src_col_map or {}
+         master_schema_match_res = [
+             SchemaMatchResult(
+                 source_column=s.column_name,
+                 databricks_column=target_column_map.get(s.column_name, s.column_name),
+                 source_datatype=s.data_type,
+                 databricks_datatype=next(
+                     (
+                         tgt.data_type
+                         for tgt in databricks_schema
+                         if tgt.column_name == target_column_map.get(s.column_name, s.column_name)
+                     ),
+                     "",
+                 ),
+             )
+             for s in master_schema
+         ]
+         return master_schema_match_res
+
+     def _create_dataframe(self, data: list, schema: StructType) -> DataFrame:
+         """
+         :param data: Expected to be a list of dataclasses
+         :param schema: Target schema
+         :return: DataFrame
+         """
+         data = [tuple(asdict(item).values()) for item in data]
+         df = self.spark.createDataFrame(data, schema)
+
+         return df
+
+     @classmethod
+     def _parse(cls, source: Dialect, column: str, data_type: str) -> str:
+         return (
+             parse_one(f"create table dummy ({column} {data_type})", read=source)
+             .sql(dialect=get_dialect("databricks"))
+             .replace(", ", ",")
+         )
+
+     @classmethod
+     def _table_schema_status(cls, schema_compare_maps: list[SchemaMatchResult]) -> bool:
+         return bool(all(x.is_valid for x in schema_compare_maps))
+
+     @classmethod
+     def _validate_parsed_query(cls, master: SchemaMatchResult, parsed_query) -> None:
+         databricks_query = f"create table dummy ({master.source_column} {master.databricks_datatype})"
+         logger.info(
+             f"""
+             Source datatype: create table dummy ({master.source_column} {master.source_datatype})
+             Parse datatype: {parsed_query}
+             Databricks datatype: {databricks_query}
+             """
+         )
+         if parsed_query.lower() != databricks_query.lower():
+             master.is_valid = False
+
+     def compare(
+         self,
+         source_schema: list[Schema],
+         databricks_schema: list[Schema],
+         source: Dialect,
+         table_conf: Table,
+     ) -> SchemaReconcileOutput:
+         """
+         This method compares the source schema with the Databricks schema. It checks whether the data type of each column
+         in the source schema matches the corresponding Databricks column by parsing it with the Remorph transpiler.
+
+         Returns:
+             SchemaReconcileOutput: A dataclass object containing a boolean indicating the overall result of the comparison and a DataFrame with the comparison details.
+         """
+         master_schema = self._build_master_schema(source_schema, databricks_schema, table_conf)
+         for master in master_schema:
+             if not isinstance(source, Databricks):
+                 parsed_query = self._parse(source, master.source_column, master.source_datatype)
+                 self._validate_parsed_query(master, parsed_query)
+             elif master.source_datatype.lower() != master.databricks_datatype.lower():
+                 master.is_valid = False
+
+         df = self._create_dataframe(master_schema, self._schema_compare_schema)
+         final_result = self._table_schema_status(master_schema)
+         return SchemaReconcileOutput(final_result, df)
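
For context on the `_parse` step above, here is a standalone sketch (not from the wheel) of the same kind of datatype normalization done directly with sqlglot dialect names instead of the package's `get_dialect` helper and custom Databricks generator; the column name and datatypes are illustrative, and the exact rendered SQL depends on the installed sqlglot version.

from sqlglot import parse_one

source_type = "NUMBER(10,2)"       # e.g. a Snowflake column declaration
databricks_type = "DECIMAL(10,2)"  # what the Databricks table reports

# Approximates the normalization _parse performs: render the source declaration
# in the Databricks dialect, then compare it case-insensitively with the target.
normalized = (
    parse_one(f"create table dummy (amount {source_type})", read="snowflake")
    .sql(dialect="databricks")
    .replace(", ", ",")
)
expected = f"create table dummy (amount {databricks_type})"
print(normalized.lower() == expected.lower())  # True when the datatypes are equivalent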
databricks/labs/lakebridge/resources/__init__.py
File without changes
databricks/labs/lakebridge/resources/config/credentials.yml
@@ -0,0 +1,33 @@
+ secret_vault_type: local | databricks | env
+ secret_vault_name: null
+ snowflake:
+   account: example_account
+   connect_retries: 1
+   connect_timeout: null
+   host: null
+   insecure_mode: false
+   oauth_client_id: null
+   oauth_client_secret: null
+   password: null
+   port: null
+   private_key: null
+   private_key_passphrase: null
+   private_key_path: null
+   role: null
+   token: null
+   user: null
+   warehouse: null
+
+ mssql:
+   #TODO Expand to support sqlpools, and legacy dwh
+   database: DB_NAME
+   driver: ODBC Driver 18 for SQL Server
+   server: example_host
+   port: null
+   user: null
+   password: null
+
+
+
+
+
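
The wheel's `connections/credential_manager.py` (listed above but not shown in this section) is the intended consumer of this file. Purely as an illustration, the sketch below loads such a YAML with PyYAML and assembles a SQL Server ODBC connection string from the `mssql` block; the file path and the fallback port are assumptions, not the package's documented behavior.

import os

import yaml  # PyYAML

# Hypothetical location; the installer decides where the real file lives.
creds_path = os.path.expanduser("~/.databricks/labs/lakebridge/.credentials.yml")

with open(creds_path, encoding="utf-8") as fh:
    creds = yaml.safe_load(fh)

mssql = creds["mssql"]
conn_str = (
    f"DRIVER={{{mssql['driver']}}};"
    f"SERVER={mssql['server']},{mssql.get('port') or 1433};"  # 1433 is an assumed default port
    f"DATABASE={mssql['database']};"
    f"UID={mssql.get('user')};PWD={mssql.get('password')}"
)
print(conn_str)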
databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/00_0_aggregate_recon_header.md
@@ -0,0 +1,6 @@
+ # Aggregates Reconcile Table Metrics
+ ### It provides the following information:
+
+ * Mismatch
+ * Missing in Source
+ * Missing in Target
databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_0_recon_id.filter.yml
@@ -0,0 +1,6 @@
+ columns:
+   - recon_id
+   - dd_recon_id
+ type: MULTI_SELECT
+ title: Recon Id
+ width: 2
databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_1_executed_by.filter.yml
@@ -0,0 +1,5 @@
+ columns:
+   - executed_by
+ type: MULTI_SELECT
+ title: Executed by
+ width: 2
databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_2_started_at.filter.yml
@@ -0,0 +1,5 @@
+ columns:
+   - start_ts
+ title: Started At
+ type: DATE_RANGE_PICKER
+ width: 2
databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_0_source_type.filter.yml
@@ -0,0 +1,5 @@
+ columns:
+   - source_type
+ type: MULTI_SELECT
+ title: Source Type
+ width: 2
databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_1_source_table.filter.yml
@@ -0,0 +1,5 @@
+ columns:
+   - source_table
+ type: MULTI_SELECT
+ title: Source Table Name
+ width: 2
databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_2_target_table.filter.yml
@@ -0,0 +1,5 @@
+ columns:
+   - target_table
+ type: MULTI_SELECT
+ title: Target Table Name
+ width: 2
databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/04_0_aggregate_summary_table.sql
@@ -0,0 +1,46 @@
+ /* --title 'Aggregates Summary Table' --width 6 --height 6 */
+ SELECT
+   main.recon_id,
+   main.source_type,
+   main.source_table.`catalog` AS source_catalog,
+   main.source_table.`schema` AS source_schema,
+   main.source_table.table_name AS source_table_name,
+   IF(
+     ISNULL(source_catalog),
+     CONCAT_WS('.', source_schema, source_table_name),
+     CONCAT_WS(
+       '.',
+       source_catalog,
+       source_schema,
+       source_table_name
+     )
+   ) AS source_table,
+   main.target_table.`catalog` AS target_catalog,
+   main.target_table.`schema` AS target_schema,
+   main.target_table.table_name AS target_table_name,
+   CONCAT_WS(
+     '.',
+     target_catalog,
+     target_schema,
+     target_table_name
+   ) AS target_table,
+   UPPER(rules.rule_info.agg_type) || CONCAT('(', rules.rule_info.agg_column, ')') AS aggregate_column,
+   rules.rule_info.group_by_columns,
+   metrics.run_metrics.status AS status,
+   metrics.run_metrics.exception_message AS exception,
+   metrics.recon_metrics.missing_in_source AS missing_in_source,
+   metrics.recon_metrics.missing_in_target AS missing_in_target,
+   metrics.recon_metrics.mismatch AS mismatch,
+   metrics.run_metrics.run_by_user AS executed_by,
+   main.start_ts AS start_ts,
+   main.end_ts AS end_ts
+ FROM
+   remorph.reconcile.main main
+   INNER JOIN remorph.reconcile.aggregate_metrics metrics
+   INNER JOIN remorph.reconcile.aggregate_rules rules
+     ON main.recon_table_id = metrics.recon_table_id
+     AND rules.rule_id = metrics.rule_id
+ ORDER BY
+   metrics.inserted_ts DESC,
+   main.recon_id,
+   main.target_table.table_name
databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/05_0_aggregate_recon_drilldown_header.md
@@ -0,0 +1,2 @@
+ # Drill Down
+ ### The Aggregates Reconcile details table contains sample-record information for mismatches and missing entries.
databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_0_recon_id.filter.yml
@@ -0,0 +1,5 @@
+ columns:
+   - dd_recon_id
+ type: MULTI_SELECT
+ title: Recon Id
+ width: 2
databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_1_category.filter.yml
@@ -0,0 +1,5 @@
+ columns:
+   - dd_recon_type
+ type: MULTI_SELECT
+ title: Category
+ width: 2
databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_2_aggregate_type.filter.yml
@@ -0,0 +1,5 @@
+ columns:
+   - dd_aggregate_type
+ type: MULTI_SELECT
+ title: Aggregate Type
+ width: 2
databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/07_0_target_table.filter.yml
@@ -0,0 +1,4 @@
+ columns:
+   - dd_target_table
+ type: MULTI_SELECT
+ title: Target Table Name
databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/07_1_source_table.filter.yml
@@ -0,0 +1,4 @@
+ columns:
+   - dd_source_table
+ type: MULTI_SELECT
+ title: Source Table Name
databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/08_0_aggregate_details_table.sql
@@ -0,0 +1,92 @@
+ /* --title 'Aggregates Reconciliation Details' --width 6 --height 6 */
+ WITH details_view AS (
+   SELECT
+     recon_table_id,
+     rule_id,
+     recon_type,
+     explode(data) AS agg_details
+   FROM
+     remorph.reconcile.aggregate_details
+ ),
+ metrics_view AS (
+   SELECT
+     recon_table_id,
+     rule_id,
+     recon_metrics,
+     run_metrics
+   FROM
+     remorph.reconcile.aggregate_metrics
+ )
+ SELECT
+   recon_id AS dd_recon_id,
+   source_table AS dd_source_table,
+   target_table AS dd_target_table,
+   recon_type AS dd_recon_type,
+   aggregate_type AS dd_aggregate_type,
+   rule AS aggregate_column,
+   source_value,
+   target_value,
+   zip_with(rule_group_by_columns, group_by_column_values, (groupby, value) -> CONCAT_WS(':', TRIM(groupby), value)) AS group_by_columns,
+   COALESCE(status, 'false') AS status
+ FROM (
+   SELECT
+     main.recon_id,
+     main.source_table.`catalog` AS source_catalog,
+     main.source_table.`schema` AS source_schema,
+     main.source_table.table_name AS source_table_name,
+     IF(
+       ISNULL(source_catalog),
+       CONCAT_WS('.', source_schema, source_table_name),
+       CONCAT_WS(
+         '.',
+         source_catalog,
+         source_schema,
+         source_table_name
+       )
+     ) AS source_table,
+     main.target_table.`catalog` AS target_catalog,
+     main.target_table.`schema` AS target_schema,
+     main.target_table.table_name AS target_table_name,
+     CONCAT_WS(
+       '.',
+       target_catalog,
+       target_schema,
+       target_table_name
+     ) AS target_table,
+     dtl.recon_type,
+     rul.rule_info.agg_type AS aggregate_type,
+     UPPER(rul.rule_info.agg_type) || CONCAT('(', rul.rule_info.agg_column, ')') AS rule,
+     CONCAT_WS(
+       '_',
+       'source',
+       rul.rule_info.agg_type,
+       rul.rule_info.agg_column
+     ) AS source_agg_column,
+     dtl.agg_details[source_agg_column] AS source_value,
+     CONCAT_WS(
+       '_',
+       'target',
+       rul.rule_info.agg_type,
+       rul.rule_info.agg_column
+     ) AS target_agg_column,
+     dtl.agg_details[target_agg_column] AS target_value,
+     SPLIT(rul.rule_info.group_by_columns, ',') AS rule_group_by_columns,
+     TRANSFORM(rule_group_by_columns, colm ->
+       COALESCE(dtl.agg_details[CONCAT('source_group_by_', TRIM(colm))],
+         dtl.agg_details[CONCAT('target_group_by_', TRIM(colm))])) AS group_by_column_values,
+     CONCAT_WS(
+       '_',
+       'match',
+       rul.rule_info.agg_type,
+       rul.rule_info.agg_column
+     ) AS status_column,
+     dtl.agg_details[status_column] AS status
+   FROM
+     metrics_view mtc
+     INNER JOIN remorph.reconcile.main main ON main.recon_table_id = mtc.recon_table_id
+     INNER JOIN details_view dtl ON mtc.recon_table_id = dtl.recon_table_id
+     INNER JOIN remorph.reconcile.aggregate_rules rul ON mtc.rule_id = dtl.rule_id
+       AND dtl.rule_id = rul.rule_id
+ )
+ ORDER BY
+   recon_id
databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/10_0_aggr_mismatched_records.sql
@@ -0,0 +1,19 @@
+ /* --title 'Mismatched Records' --width 6 */
+ SELECT
+   main.recon_id,
+   CONCAT_WS(
+     '.',
+     main.target_table.`catalog`,
+     main.target_table.`schema`,
+     main.target_table.table_name
+   ) AS target_table,
+   main.start_ts,
+   metrics.recon_metrics.mismatch AS mismatch
+ FROM
+   remorph.reconcile.main main
+   INNER JOIN remorph.reconcile.aggregate_metrics metrics
+     ON main.recon_table_id = metrics.recon_table_id
+ ORDER BY
+   metrics.inserted_ts DESC,
+   main.recon_id,
+   main.target_table.table_name