databricks-labs-lakebridge 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- databricks/__init__.py +3 -0
- databricks/labs/__init__.py +3 -0
- databricks/labs/lakebridge/__about__.py +2 -0
- databricks/labs/lakebridge/__init__.py +11 -0
- databricks/labs/lakebridge/assessments/configure_assessment.py +194 -0
- databricks/labs/lakebridge/assessments/pipeline.py +188 -0
- databricks/labs/lakebridge/assessments/profiler_config.py +30 -0
- databricks/labs/lakebridge/base_install.py +12 -0
- databricks/labs/lakebridge/cli.py +449 -0
- databricks/labs/lakebridge/config.py +192 -0
- databricks/labs/lakebridge/connections/__init__.py +0 -0
- databricks/labs/lakebridge/connections/credential_manager.py +89 -0
- databricks/labs/lakebridge/connections/database_manager.py +98 -0
- databricks/labs/lakebridge/connections/env_getter.py +13 -0
- databricks/labs/lakebridge/contexts/__init__.py +0 -0
- databricks/labs/lakebridge/contexts/application.py +133 -0
- databricks/labs/lakebridge/coverage/__init__.py +0 -0
- databricks/labs/lakebridge/coverage/commons.py +223 -0
- databricks/labs/lakebridge/coverage/lakebridge_snow_transpilation_coverage.py +29 -0
- databricks/labs/lakebridge/coverage/local_report.py +9 -0
- databricks/labs/lakebridge/coverage/sqlglot_snow_transpilation_coverage.py +5 -0
- databricks/labs/lakebridge/coverage/sqlglot_tsql_transpilation_coverage.py +5 -0
- databricks/labs/lakebridge/deployment/__init__.py +0 -0
- databricks/labs/lakebridge/deployment/configurator.py +199 -0
- databricks/labs/lakebridge/deployment/dashboard.py +140 -0
- databricks/labs/lakebridge/deployment/installation.py +125 -0
- databricks/labs/lakebridge/deployment/job.py +147 -0
- databricks/labs/lakebridge/deployment/recon.py +145 -0
- databricks/labs/lakebridge/deployment/table.py +30 -0
- databricks/labs/lakebridge/deployment/upgrade_common.py +124 -0
- databricks/labs/lakebridge/discovery/table.py +36 -0
- databricks/labs/lakebridge/discovery/table_definition.py +23 -0
- databricks/labs/lakebridge/discovery/tsql_table_definition.py +185 -0
- databricks/labs/lakebridge/errors/exceptions.py +1 -0
- databricks/labs/lakebridge/helpers/__init__.py +0 -0
- databricks/labs/lakebridge/helpers/db_sql.py +24 -0
- databricks/labs/lakebridge/helpers/execution_time.py +20 -0
- databricks/labs/lakebridge/helpers/file_utils.py +64 -0
- databricks/labs/lakebridge/helpers/metastore.py +164 -0
- databricks/labs/lakebridge/helpers/recon_config_utils.py +176 -0
- databricks/labs/lakebridge/helpers/string_utils.py +62 -0
- databricks/labs/lakebridge/helpers/telemetry_utils.py +13 -0
- databricks/labs/lakebridge/helpers/validation.py +101 -0
- databricks/labs/lakebridge/install.py +849 -0
- databricks/labs/lakebridge/intermediate/__init__.py +0 -0
- databricks/labs/lakebridge/intermediate/dag.py +88 -0
- databricks/labs/lakebridge/intermediate/engine_adapter.py +0 -0
- databricks/labs/lakebridge/intermediate/root_tables.py +44 -0
- databricks/labs/lakebridge/jvmproxy.py +56 -0
- databricks/labs/lakebridge/lineage.py +42 -0
- databricks/labs/lakebridge/reconcile/__init__.py +0 -0
- databricks/labs/lakebridge/reconcile/compare.py +414 -0
- databricks/labs/lakebridge/reconcile/connectors/__init__.py +0 -0
- databricks/labs/lakebridge/reconcile/connectors/data_source.py +72 -0
- databricks/labs/lakebridge/reconcile/connectors/databricks.py +87 -0
- databricks/labs/lakebridge/reconcile/connectors/jdbc_reader.py +41 -0
- databricks/labs/lakebridge/reconcile/connectors/oracle.py +108 -0
- databricks/labs/lakebridge/reconcile/connectors/secrets.py +30 -0
- databricks/labs/lakebridge/reconcile/connectors/snowflake.py +173 -0
- databricks/labs/lakebridge/reconcile/connectors/source_adapter.py +30 -0
- databricks/labs/lakebridge/reconcile/connectors/sql_server.py +132 -0
- databricks/labs/lakebridge/reconcile/constants.py +37 -0
- databricks/labs/lakebridge/reconcile/exception.py +42 -0
- databricks/labs/lakebridge/reconcile/execute.py +920 -0
- databricks/labs/lakebridge/reconcile/query_builder/__init__.py +0 -0
- databricks/labs/lakebridge/reconcile/query_builder/aggregate_query.py +293 -0
- databricks/labs/lakebridge/reconcile/query_builder/base.py +138 -0
- databricks/labs/lakebridge/reconcile/query_builder/count_query.py +33 -0
- databricks/labs/lakebridge/reconcile/query_builder/expression_generator.py +292 -0
- databricks/labs/lakebridge/reconcile/query_builder/hash_query.py +91 -0
- databricks/labs/lakebridge/reconcile/query_builder/sampling_query.py +123 -0
- databricks/labs/lakebridge/reconcile/query_builder/threshold_query.py +231 -0
- databricks/labs/lakebridge/reconcile/recon_capture.py +635 -0
- databricks/labs/lakebridge/reconcile/recon_config.py +363 -0
- databricks/labs/lakebridge/reconcile/recon_output_config.py +85 -0
- databricks/labs/lakebridge/reconcile/runner.py +97 -0
- databricks/labs/lakebridge/reconcile/sampler.py +239 -0
- databricks/labs/lakebridge/reconcile/schema_compare.py +126 -0
- databricks/labs/lakebridge/resources/__init__.py +0 -0
- databricks/labs/lakebridge/resources/config/credentials.yml +33 -0
- databricks/labs/lakebridge/resources/reconcile/__init__.py +0 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/__init__.py +0 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/00_0_aggregate_recon_header.md +6 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_0_recon_id.filter.yml +6 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_1_executed_by.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_2_started_at.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_0_source_type.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_1_source_table.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_2_target_table.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/04_0_aggregate_summary_table.sql +46 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/05_0_aggregate_recon_drilldown_header.md +2 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_0_recon_id.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_1_category.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_2_aggregate_type.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/07_0_target_table.filter.yml +4 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/07_1_source_table.filter.yml +4 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/08_0_aggregate_details_table.sql +92 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/09_0_aggregate_missing_mismatch_header.md +1 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/10_0_aggr_mismatched_records.sql +19 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/11_0_aggr_missing_in_databricks.sql +19 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/11_1_aggr_missing_in_source.sql +19 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/dashboard.yml +365 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/00_0_recon_main.md +3 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/01_0_recon_id.filter.yml +6 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/01_1_report_type.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/01_2_executed_by.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/02_0_source_type.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/02_1_source_table.filter.yml +6 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/02_2_target_table.filter.yml +6 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/03_0_started_at.filter.yml +5 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/05_0_summary_table.sql +38 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/06_0_schema_comparison_header.md +3 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/07_0_schema_details_table.sql +42 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/08_0_drill_down_header.md +3 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/09_0_recon_id.filter.yml +4 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/09_1_category.filter.yml +4 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/10_0_target_table.filter.yml +4 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/10_1_source_table.filter.yml +4 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/11_0_recon_details_pivot.sql +40 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/12_0_daily_data_validation_issue_header.md +3 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/13_0_success_fail_.filter.yml +4 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/14_0_failed_recon_ids.sql +15 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/15_0_total_failed_runs.sql +10 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/15_1_failed_targets.sql +10 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/15_2_successful_targets.sql +10 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/16_0_missing_mismatch_header.md +1 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/17_0_mismatched_records.sql +14 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/17_1_threshold_mismatches.sql +14 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/18_0_missing_in_databricks.sql +14 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/18_1_missing_in_source.sql +14 -0
- databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/dashboard.yml +545 -0
- databricks/labs/lakebridge/resources/reconcile/queries/__init__.py +0 -0
- databricks/labs/lakebridge/resources/reconcile/queries/installation/__init__.py +0 -0
- databricks/labs/lakebridge/resources/reconcile/queries/installation/aggregate_details.sql +7 -0
- databricks/labs/lakebridge/resources/reconcile/queries/installation/aggregate_metrics.sql +15 -0
- databricks/labs/lakebridge/resources/reconcile/queries/installation/aggregate_rules.sql +6 -0
- databricks/labs/lakebridge/resources/reconcile/queries/installation/details.sql +7 -0
- databricks/labs/lakebridge/resources/reconcile/queries/installation/main.sql +24 -0
- databricks/labs/lakebridge/resources/reconcile/queries/installation/metrics.sql +21 -0
- databricks/labs/lakebridge/transpiler/__init__.py +0 -0
- databricks/labs/lakebridge/transpiler/execute.py +423 -0
- databricks/labs/lakebridge/transpiler/lsp/__init__.py +0 -0
- databricks/labs/lakebridge/transpiler/lsp/lsp_engine.py +564 -0
- databricks/labs/lakebridge/transpiler/sqlglot/__init__.py +0 -0
- databricks/labs/lakebridge/transpiler/sqlglot/dialect_utils.py +30 -0
- databricks/labs/lakebridge/transpiler/sqlglot/generator/__init__.py +0 -0
- databricks/labs/lakebridge/transpiler/sqlglot/generator/databricks.py +771 -0
- databricks/labs/lakebridge/transpiler/sqlglot/lca_utils.py +138 -0
- databricks/labs/lakebridge/transpiler/sqlglot/local_expression.py +197 -0
- databricks/labs/lakebridge/transpiler/sqlglot/parsers/__init__.py +0 -0
- databricks/labs/lakebridge/transpiler/sqlglot/parsers/oracle.py +23 -0
- databricks/labs/lakebridge/transpiler/sqlglot/parsers/presto.py +202 -0
- databricks/labs/lakebridge/transpiler/sqlglot/parsers/snowflake.py +535 -0
- databricks/labs/lakebridge/transpiler/sqlglot/sqlglot_engine.py +203 -0
- databricks/labs/lakebridge/transpiler/transpile_engine.py +49 -0
- databricks/labs/lakebridge/transpiler/transpile_status.py +68 -0
- databricks/labs/lakebridge/uninstall.py +28 -0
- databricks/labs/lakebridge/upgrades/v0.4.0_add_main_table_operation_name_column.py +80 -0
- databricks/labs/lakebridge/upgrades/v0.6.0_alter_metrics_datatype.py +51 -0
- databricks_labs_lakebridge-0.10.0.dist-info/METADATA +58 -0
- databricks_labs_lakebridge-0.10.0.dist-info/RECORD +171 -0
- databricks_labs_lakebridge-0.10.0.dist-info/WHEEL +4 -0
- databricks_labs_lakebridge-0.10.0.dist-info/entry_points.txt +2 -0
- databricks_labs_lakebridge-0.10.0.dist-info/licenses/LICENSE +69 -0
- databricks_labs_lakebridge-0.10.0.dist-info/licenses/NOTICE +42 -0
- docs/lakebridge/src/components/Button.tsx +81 -0
- docs/lakebridge/src/css/custom.css +167 -0
- docs/lakebridge/src/css/table.css +20 -0
- docs/lakebridge/src/pages/index.tsx +57 -0
- docs/lakebridge/src/theme/Footer/index.tsx +24 -0
- docs/lakebridge/src/theme/Layout/index.tsx +18 -0
@@ -0,0 +1,239 @@
+import logging
+from abc import ABC, abstractmethod
+
+from pyspark.sql import DataFrame
+from pyspark.sql import functions as F
+
+from databricks.labs.lakebridge.reconcile.constants import SamplingOptionMethod, SamplingSpecificationsType
+from databricks.labs.lakebridge.reconcile.recon_config import SamplingOptions, SamplingSpecifications
+
+logger = logging.getLogger(__name__)
+
+_MIN_SAMPLE_COUNT = 50
+_MAX_SAMPLE_COUNT = 400
+
+_MIN_BUCKET_LIMIT = 2
+_MAX_BUCKET_LIMIT = 50
+
+
+class Sampler(ABC):
+    def __init__(self, sampling_options: SamplingOptions):
+        self._sampling_options = sampling_options
+
+    @abstractmethod
+    def _validate_sampling_options(self):
+        return NotImplemented
+
+    @abstractmethod
+    def sample(
+        self,
+        keys_df: DataFrame,
+        keys_df_count: int,
+        key_columns: list[str],
+        target_table: DataFrame,
+    ) -> DataFrame:
+        return NotImplemented
+
+
+class RandomSampler(Sampler):
+    """
+    RandomSampler is a subclass of Sampler that performs random sampling on a given DataFrame.
+    """
+
+    def __init__(self, sampling_options: SamplingOptions, seed: int = 100):
+        """
+        Attributes:
+            sampling_options (SamplingOptions): The options for sampling, including method, specifications.
+            seed (int): The seed for random sampling to ensure reproducibility.
+        """
+        super().__init__(sampling_options)
+        self.seed = seed
+
+    def _validate_sampling_options(self):
+        if self._sampling_options.method != SamplingOptionMethod.RANDOM:
+            raise ValueError("RandomSampler: Only 'random' sampling method is supported")
+
+        specs = self._sampling_options.specifications
+        if specs.type == SamplingSpecificationsType.COUNT and (
+            specs.value is None or (specs.value < _MIN_SAMPLE_COUNT)
+        ):
+            logger.info(
+                f"RandomSampler: Sample count must be >= {_MIN_SAMPLE_COUNT}, " f"flooring to {_MIN_SAMPLE_COUNT}"
+            )
+            self._sampling_options.specifications.value = _MIN_SAMPLE_COUNT
+
+        elif specs.type == SamplingSpecificationsType.COUNT and specs.value > _MAX_SAMPLE_COUNT:
+            logger.info(
+                f"RandomSampler: Sample count must be <= {_MAX_SAMPLE_COUNT}, " f"capping to {_MAX_SAMPLE_COUNT}"
+            )
+            self._sampling_options.specifications.value = _MAX_SAMPLE_COUNT
+
+    def sample(
+        self, keys_df: DataFrame, keys_df_count: int, key_columns: list[str], target_table: DataFrame
+    ) -> DataFrame:
+        """
+        Performs random sampling on the given DataFrame based on the specified options.
+        - Validates the sampling options.
+        - Uses pre-calculated `keys_df_count` from `reconcile_output.mismatch_count` to avoid recomputing `keys_df`.
+        - If the specifications type is FRACTION, samples the DataFrame based on the fraction value.
+        - If the specifications type is COUNT, calculates the fraction and samples the DataFrame accordingly,
+          then limits the sample size to the specified count.
+        - Returns the sampled DataFrame.
+        """
+
+        self._validate_sampling_options()
+        specs = self._sampling_options.specifications
+
+        default_sampled_df = keys_df.limit(_MIN_SAMPLE_COUNT)
+
+        if specs.type == SamplingSpecificationsType.FRACTION:
+            sampled_df = keys_df.sample(fraction=specs.value, seed=self.seed)
+        elif specs.type == SamplingSpecificationsType.COUNT:
+            total_count = keys_df_count
+            sample_size = int(specs.value)
+            fraction = min(1.0, sample_size / total_count)
+            sampled_df = keys_df.sample(fraction=fraction, seed=self.seed).limit(sample_size)
+        else:
+            return default_sampled_df
+
+        return sampled_df
+
+
+class StratifiedSampler(Sampler):
+    """
+    StratifiedSampler is a subclass of Sampler that performs stratified sampling on a given DataFrame.
+    """
+
+    def __init__(self, sampling_options: SamplingOptions, seed: int = 100):
+        """
+        Attributes:
+            sampling_options (SamplingOptions): The options for sampling, including method, specifications,
+                stratified columns, and stratified buckets.
+            seed (int): The seed for random sampling to ensure reproducibility.
+        """
+        super().__init__(sampling_options)
+        self.seed = seed
+
+    def _validate_sampling_options(self):
+        if self._sampling_options.method != SamplingOptionMethod.STRATIFIED:
+            raise ValueError("StratifiedSampler: Only 'stratified' sampling method is supported")
+
+        specs = self._sampling_options.specifications
+        stratified_buckets = self._sampling_options.stratified_buckets
+
+        if specs.type == SamplingSpecificationsType.COUNT and (
+            specs.value is None or (specs.value < _MIN_SAMPLE_COUNT)
+        ):
+            logger.info(
+                f"StratifiedSampler: Sample count must be >= {_MIN_SAMPLE_COUNT}, " f"flooring to {_MIN_SAMPLE_COUNT}"
+            )
+            self._sampling_options.specifications.value = _MIN_SAMPLE_COUNT
+
+        elif specs.type == SamplingSpecificationsType.COUNT and specs.value > _MAX_SAMPLE_COUNT:
+            logger.info(
+                f"StratifiedSampler: Sample count must be <= {_MAX_SAMPLE_COUNT}, " f"capping to {_MAX_SAMPLE_COUNT}"
+            )
+            self._sampling_options.specifications.value = _MAX_SAMPLE_COUNT
+
+        if stratified_buckets < _MIN_BUCKET_LIMIT:
+            logger.info(
+                f"StratifiedSampler: Stratified buckets must be >= {_MIN_BUCKET_LIMIT}, "
+                f"flooring to {_MIN_BUCKET_LIMIT}"
+            )
+            self._sampling_options.stratified_buckets = _MIN_BUCKET_LIMIT
+        elif stratified_buckets > _MAX_BUCKET_LIMIT:
+            logger.info(
+                f"StratifiedSampler: Stratified buckets must be <= {_MAX_BUCKET_LIMIT}, "
+                f"capping to {_MAX_BUCKET_LIMIT}"
+            )
+            self._sampling_options.stratified_buckets = _MAX_BUCKET_LIMIT
+
+    def sample(
+        self, keys_df: DataFrame, keys_df_count: int, key_columns: list[str], target_table: DataFrame
+    ) -> DataFrame:
+        """
+        Performs stratified sampling on the given DataFrame based on the specified options.
+        - Joins the keys_df with the target_table on the key_columns.
+        - Creates a hash bucket column based on the stratified columns.
+        - Adds the bucket column to the joined DataFrame.
+        - If specifications type is FRACTION, calculates fractions for each bucket and samples accordingly.
+        - If specifications type is COUNT, calculates fractions for each bucket and samples accordingly,
+          then limits the sample size to the specified count.
+        - Returns the sampled DataFrame.
+        """
+        self._validate_sampling_options()
+
+        specs = self._sampling_options.specifications
+        stratified_columns = self._sampling_options.stratified_columns
+        non_key_stratified_columns = [
+            col for col in (self._sampling_options.stratified_columns or []) if col not in key_columns
+        ]
+        stratified_buckets = self._sampling_options.stratified_buckets or _MIN_BUCKET_LIMIT
+
+        keys_df.select(*key_columns)
+        default_sampled_df = keys_df.limit(_MIN_SAMPLE_COUNT)
+
+        # Join the mismatched_df with target_table_df
+        joined_df = keys_df.join(
+            target_table, [keys_df[col] == target_table[col] for col in key_columns], "inner"
+        ).select(*[keys_df[col] for col in key_columns], *[target_table[col] for col in non_key_stratified_columns])
+
+        # Create a hash bucket column based on the stratified columns
+        bucket_col = F.pmod(F.abs(F.hash(*[F.col(c) for c in (stratified_columns or [])])), stratified_buckets)
+
+        # Add the bucket column to the joined_df
+        bucketed_df = joined_df.withColumn("bucket", bucket_col)
+
+        if specs.type == SamplingSpecificationsType.FRACTION:
+            # Calculate fractions for each bucket
+            unique_values = bucketed_df.select("bucket").distinct().collect()
+            fractions = {row["bucket"]: specs.value for row in unique_values}
+            sampled_df = bucketed_df.sampleBy("bucket", fractions=fractions, seed=self.seed)
+
+        elif specs.type == SamplingSpecificationsType.COUNT:
+            # Calculate fractions for each bucket
+            sample_size = int(specs.value)
+            bucket_counts = bucketed_df.groupBy("bucket").count().collect()
+            total_count = sum(row['count'] for row in bucket_counts)
+            fractions = {
+                row["bucket"]: min(1.0, (sample_size * row['count'] / total_count) / row['count'])
+                for row in bucket_counts
+            }
+            sampled_df = bucketed_df.sampleBy("bucket", fractions=fractions, seed=self.seed).limit(sample_size)
+
+        else:
+            return default_sampled_df.select(*key_columns)
+
+        return sampled_df.select(*key_columns)
+
+
+# TODO: Move away from SamplerFactory to a context-driven approach
+class SamplerFactory:
+    @staticmethod
+    def get_sampler(sampling_options: SamplingOptions, seed: int = 100) -> Sampler:
+        # If no sampling options provided, use default
+        if sampling_options is None:
+            default_sampling_options = SamplingOptions(
+                method=SamplingOptionMethod.RANDOM,
+                specifications=SamplingSpecifications(type=SamplingSpecificationsType.COUNT, value=_MIN_SAMPLE_COUNT),
+                stratified_columns=None,
+                stratified_buckets=None,
+            )
+            logger.info(
+                f"SamplerFactory: No sampling options provided, using default options: " f"{default_sampling_options}"
+            )
+            sampling_options = default_sampling_options
+
+        else:
+            logger.info(f"SamplerFactory: Creating sampler using provided options: " f"{sampling_options}")
+
+        # Use a dictionary-based dispatch for better extensibility
+        sampler_map = {SamplingOptionMethod.RANDOM: RandomSampler, SamplingOptionMethod.STRATIFIED: StratifiedSampler}
+
+        # Get the sampler class
+        sampler_class = sampler_map.get(sampling_options.method)
+
+        if sampler_class is None:
+            raise ValueError(f"SamplerFactory : Unsupported sampling method: {sampling_options.method}")
+
+        return sampler_class(sampling_options, seed)
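Note: judging by the +239 line count in the file list and the class names, the hunk above is the new databricks/labs/lakebridge/reconcile/sampler.py module. The sketch below is editorial and not part of the wheel: it shows how SamplerFactory.get_sampler and Sampler.sample could be driven, assuming an active Spark session; the table and column names (mismatched_keys, orders, order_id) are invented for illustration, and the SamplingOptions/SamplingSpecifications keyword arguments mirror the factory defaults shown above.

# Editorial sketch (not in the package): exercising the sampler module shown above.
from pyspark.sql import SparkSession

from databricks.labs.lakebridge.reconcile.constants import SamplingOptionMethod, SamplingSpecificationsType
from databricks.labs.lakebridge.reconcile.recon_config import SamplingOptions, SamplingSpecifications
from databricks.labs.lakebridge.reconcile.sampler import SamplerFactory

spark = SparkSession.builder.getOrCreate()

# Hypothetical inputs: the key rows flagged as mismatched and the reconciliation target table.
keys_df = spark.table("catalog.schema.mismatched_keys").select("order_id")
target_df = spark.table("catalog.schema.orders")
keys_count = keys_df.count()  # in the package this arrives pre-computed (see the docstring above)

# Random sampling capped at a fixed row count; kwargs mirror SamplerFactory's own defaults.
options = SamplingOptions(
    method=SamplingOptionMethod.RANDOM,
    specifications=SamplingSpecifications(type=SamplingSpecificationsType.COUNT, value=50),
    stratified_columns=None,
    stratified_buckets=None,
)

sampler = SamplerFactory.get_sampler(options, seed=100)
sampled_keys = sampler.sample(keys_df, keys_count, ["order_id"], target_df)
sampled_keys.show()

Switching method to SamplingOptionMethod.STRATIFIED (with stratified_columns and stratified_buckets set) would route through StratifiedSampler instead, per the dispatch map in get_sampler.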
@@ -0,0 +1,126 @@
+import logging
+from dataclasses import asdict
+
+from pyspark.sql import DataFrame, SparkSession
+from pyspark.sql.types import BooleanType, StringType, StructField, StructType
+from sqlglot import Dialect, parse_one
+
+from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import get_dialect
+from databricks.labs.lakebridge.reconcile.recon_config import Schema, Table
+from databricks.labs.lakebridge.reconcile.recon_output_config import SchemaMatchResult, SchemaReconcileOutput
+from databricks.labs.lakebridge.transpiler.sqlglot.generator.databricks import Databricks
+
+logger = logging.getLogger(__name__)
+
+
+class SchemaCompare:
+    def __init__(
+        self,
+        spark: SparkSession,
+    ):
+        self.spark = spark
+
+    # Define the schema for the schema compare DataFrame
+    _schema_compare_schema: StructType = StructType(
+        [
+            StructField("source_column", StringType(), False),
+            StructField("source_datatype", StringType(), False),
+            StructField("databricks_column", StringType(), True),
+            StructField("databricks_datatype", StringType(), True),
+            StructField("is_valid", BooleanType(), False),
+        ]
+    )
+
+    @classmethod
+    def _build_master_schema(
+        cls,
+        source_schema: list[Schema],
+        databricks_schema: list[Schema],
+        table_conf: Table,
+    ) -> list[SchemaMatchResult]:
+        master_schema = source_schema
+        if table_conf.select_columns:
+            master_schema = [schema for schema in master_schema if schema.column_name in table_conf.select_columns]
+        if table_conf.drop_columns:
+            master_schema = [sschema for sschema in master_schema if sschema.column_name not in table_conf.drop_columns]
+
+        target_column_map = table_conf.to_src_col_map or {}
+        master_schema_match_res = [
+            SchemaMatchResult(
+                source_column=s.column_name,
+                databricks_column=target_column_map.get(s.column_name, s.column_name),
+                source_datatype=s.data_type,
+                databricks_datatype=next(
+                    (
+                        tgt.data_type
+                        for tgt in databricks_schema
+                        if tgt.column_name == target_column_map.get(s.column_name, s.column_name)
+                    ),
+                    "",
+                ),
+            )
+            for s in master_schema
+        ]
+        return master_schema_match_res
+
+    def _create_dataframe(self, data: list, schema: StructType) -> DataFrame:
+        """
+        :param data: Expectation is list of dataclass
+        :param schema: Target schema
+        :return: DataFrame
+        """
+        data = [tuple(asdict(item).values()) for item in data]
+        df = self.spark.createDataFrame(data, schema)
+
+        return df
+
+    @classmethod
+    def _parse(cls, source: Dialect, column: str, data_type: str) -> str:
+        return (
+            parse_one(f"create table dummy ({column} {data_type})", read=source)
+            .sql(dialect=get_dialect("databricks"))
+            .replace(", ", ",")
+        )
+
+    @classmethod
+    def _table_schema_status(cls, schema_compare_maps: list[SchemaMatchResult]) -> bool:
+        return bool(all(x.is_valid for x in schema_compare_maps))
+
+    @classmethod
+    def _validate_parsed_query(cls, master: SchemaMatchResult, parsed_query) -> None:
+        databricks_query = f"create table dummy ({master.source_column} {master.databricks_datatype})"
+        logger.info(
+            f"""
+        Source datatype: create table dummy ({master.source_column} {master.source_datatype})
+        Parse datatype: {parsed_query}
+        Databricks datatype: {databricks_query}
+        """
+        )
+        if parsed_query.lower() != databricks_query.lower():
+            master.is_valid = False
+
+    def compare(
+        self,
+        source_schema: list[Schema],
+        databricks_schema: list[Schema],
+        source: Dialect,
+        table_conf: Table,
+    ) -> SchemaReconcileOutput:
+        """
+        This method compares the source schema and the Databricks schema. It checks if the data types of the columns in the source schema
+        match with the corresponding columns in the Databricks schema by parsing using remorph transpile.
+
+        Returns:
+            SchemaReconcileOutput: A dataclass object containing a boolean indicating the overall result of the comparison and a DataFrame with the comparison details.
+        """
+        master_schema = self._build_master_schema(source_schema, databricks_schema, table_conf)
+        for master in master_schema:
+            if not isinstance(source, Databricks):
+                parsed_query = self._parse(source, master.source_column, master.source_datatype)
+                self._validate_parsed_query(master, parsed_query)
+            elif master.source_datatype.lower() != master.databricks_datatype.lower():
+                master.is_valid = False
+
+        df = self._create_dataframe(master_schema, self._schema_compare_schema)
+        final_result = self._table_schema_status(master_schema)
+        return SchemaReconcileOutput(final_result, df)
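By line count (+126) and content, the hunk above is databricks/labs/lakebridge/reconcile/schema_compare.py. The sketch below is editorial and heavily hedged: the Schema keyword construction, the "snowflake" dialect name, and the SimpleNamespace stand-in for the Table config are assumptions made for illustration; only the attributes that compare() is shown reading above are relied on.

# Editorial sketch (not in the package): driving SchemaCompare.compare as defined above.
from types import SimpleNamespace

from pyspark.sql import SparkSession

from databricks.labs.lakebridge.reconcile.recon_config import Schema
from databricks.labs.lakebridge.reconcile.schema_compare import SchemaCompare
from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import get_dialect

spark = SparkSession.builder.getOrCreate()

# Assumption: Schema is a dataclass taking column_name and data_type, the only fields compare() reads.
source_schema = [
    Schema(column_name="id", data_type="NUMBER(10,0)"),
    Schema(column_name="name", data_type="VARCHAR(100)"),
]
databricks_schema = [
    Schema(column_name="id", data_type="decimal(10,0)"),
    Schema(column_name="name", data_type="string"),
]

# Stand-in for the Table config: compare() only consults select_columns, drop_columns and
# to_src_col_map; the real Table dataclass has additional fields not shown in this hunk.
table_conf = SimpleNamespace(select_columns=None, drop_columns=None, to_src_col_map=None)

result = SchemaCompare(spark).compare(
    source_schema,
    databricks_schema,
    get_dialect("snowflake"),  # assumption: "snowflake" is an accepted source dialect name
    table_conf,
)
print(result)  # SchemaReconcileOutput(final_result, df); its field names are not shown in this hunk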
File without changes
@@ -0,0 +1,33 @@
+secret_vault_type: local | databricks | env
+secret_vault_name: null
+snowflake:
+  account: example_account
+  connect_retries: 1
+  connect_timeout: null
+  host: null
+  insecure_mode: false
+  oauth_client_id: null
+  oauth_client_secret: null
+  password: null
+  port: null
+  private_key: null
+  private_key_passphrase: null
+  private_key_path: null
+  role: null
+  token: null
+  user: null
+  warehouse: null
+
+mssql:
+  #TODO Expand to support sqlpools, and legacy dwh
+  database: DB_NAME
+  driver: ODBC Driver 18 for SQL Server
+  server: example_host
+  port: null
+  user: null
+  password: null
+
+
+
+
+
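The +33 hunk above lines up with databricks/labs/lakebridge/resources/config/credentials.yml. Below is an editorial sketch of reading a profile shaped like this with PyYAML; it is not how the package's credential_manager.py necessarily consumes the file, and the file path used here is assumed.

# Editorial sketch (not in the package): loading a credentials profile shaped like the hunk above.
import yaml  # PyYAML

with open("credentials.yml", encoding="utf-8") as fh:
    creds = yaml.safe_load(fh)

vault_type = creds.get("secret_vault_type")  # e.g. "local", "databricks" or "env"
snowflake = creds.get("snowflake", {})       # nested block: account, user, password, warehouse, ...
mssql = creds.get("mssql", {})               # nested block: server, database, driver, port, ...

print(vault_type, snowflake.get("account"), mssql.get("driver"))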
File without changes
File without changes
@@ -0,0 +1,46 @@
+/* --title 'Aggregates Summary Table' --width 6 --height 6 */
+SELECT
+  main.recon_id,
+  main.source_type,
+  main.source_table.`catalog` AS source_catalog,
+  main.source_table.`schema` AS source_schema,
+  main.source_table.table_name AS source_table_name,
+  IF(
+    ISNULL(source_catalog),
+    CONCAT_WS('.', source_schema, source_table_name),
+    CONCAT_WS(
+      '.',
+      source_catalog,
+      source_schema,
+      source_table_name
+    )
+  ) AS source_table,
+  main.target_table.`catalog` AS target_catalog,
+  main.target_table.`schema` AS target_schema,
+  main.target_table.table_name AS target_table_name,
+  CONCAT_WS(
+    '.',
+    target_catalog,
+    target_schema,
+    target_table_name
+  ) AS target_table,
+  UPPER(rules.rule_info.agg_type) || CONCAT('(', rules.rule_info.agg_column, ')') AS aggregate_column,
+  rules.rule_info.group_by_columns,
+  metrics.run_metrics.status AS status,
+  metrics.run_metrics.exception_message AS exception,
+  metrics.recon_metrics.missing_in_source AS missing_in_source,
+  metrics.recon_metrics.missing_in_target AS missing_in_target,
+  metrics.recon_metrics.mismatch AS mismatch,
+  metrics.run_metrics.run_by_user AS executed_by,
+  main.start_ts AS start_ts,
+  main.end_ts AS end_ts
+FROM
+  remorph.reconcile.main main
+  INNER JOIN remorph.reconcile.aggregate_metrics metrics
+  INNER JOIN remorph.reconcile.aggregate_rules rules
+    ON main.recon_table_id = metrics.recon_table_id
+    AND rules.rule_id = metrics.rule_id
+ORDER BY
+  metrics.inserted_ts DESC,
+  main.recon_id,
+  main.target_table.table_name
@@ -0,0 +1,92 @@
+/* --title 'Aggregates Reconciliation Details' --width 6 --height 6 */
+WITH details_view AS (
+  SELECT
+    recon_table_id,
+    rule_id,
+    recon_type,
+    explode(data) AS agg_details
+  FROM
+    remorph.reconcile.aggregate_details
+),
+metrics_view AS (
+  SELECT
+    recon_table_id,
+    rule_id,
+    recon_metrics,
+    run_metrics
+  FROM
+    remorph.reconcile.aggregate_metrics
+)
+SELECT
+  recon_id AS dd_recon_id,
+  source_table AS dd_source_table,
+  target_table AS dd_target_table,
+  recon_type AS dd_recon_type,
+  aggregate_type AS dd_aggregate_type,
+  rule AS aggregate_column,
+  source_value,
+  target_value,
+  zip_with(rule_group_by_columns, group_by_column_values, (groupby, value) -> CONCAT_WS(':', TRIM(groupby), value)) AS group_by_columns,
+  COALESCE(status, 'false') AS status
+FROM (
+  SELECT
+    main.recon_id,
+    main.source_table.`catalog` AS source_catalog,
+    main.source_table.`schema` AS source_schema,
+    main.source_table.table_name AS source_table_name,
+    IF(
+      ISNULL(source_catalog),
+      CONCAT_WS('.', source_schema, source_table_name),
+      CONCAT_WS(
+        '.',
+        source_catalog,
+        source_schema,
+        source_table_name
+      )
+    ) AS source_table,
+    main.target_table.`catalog` AS target_catalog,
+    main.target_table.`schema` AS target_schema,
+    main.target_table.table_name AS target_table_name,
+    CONCAT_WS(
+      '.',
+      target_catalog,
+      target_schema,
+      target_table_name
+    ) AS target_table,
+    dtl.recon_type,
+    rul.rule_info.agg_type AS aggregate_type,
+    UPPER(rul.rule_info.agg_type) || CONCAT('(', rul.rule_info.agg_column, ')') AS rule,
+    CONCAT_WS(
+      '_',
+      'source',
+      rul.rule_info.agg_type,
+      rul.rule_info.agg_column
+    ) AS source_agg_column,
+    dtl.agg_details[source_agg_column] AS source_value,
+    CONCAT_WS(
+      '_',
+      'target',
+      rul.rule_info.agg_type,
+      rul.rule_info.agg_column
+    ) AS target_agg_column,
+    dtl.agg_details[target_agg_column] AS target_value,
+    SPLIT(rul.rule_info.group_by_columns, ',') AS rule_group_by_columns,
+    TRANSFORM(rule_group_by_columns, colm ->
+      COALESCE(dtl.agg_details[CONCAT('source_group_by_', TRIM(colm))],
+        dtl.agg_details[CONCAT('target_group_by_', TRIM(colm))])) AS group_by_column_values,
+    CONCAT_WS(
+      '_',
+      'match',
+      rul.rule_info.agg_type,
+      rul.rule_info.agg_column
+    ) AS status_column,
+    dtl.agg_details[status_column] AS status
+  FROM
+    metrics_view mtc
+    INNER JOIN remorph.reconcile.main main ON main.recon_table_id = mtc.recon_table_id
+    INNER JOIN details_view dtl ON mtc.recon_table_id = dtl.recon_table_id
+    INNER JOIN remorph.reconcile.aggregate_rules rul ON mtc.rule_id = dtl.rule_id
+      AND dtl.rule_id = rul.rule_id
+)
+ORDER BY
+  recon_id
@@ -0,0 +1 @@
+# Visualization of Missing and Mismatched Records
@@ -0,0 +1,19 @@
+/* --title 'Mismatched Records' --width 6 */
+SELECT
+  main.recon_id,
+  CONCAT_WS(
+    '.',
+    main.target_table.`catalog`,
+    main.target_table.`schema`,
+    main.target_table.table_name
+  ) AS target_table,
+  main.start_ts,
+  metrics.recon_metrics.mismatch AS mismatch
+FROM
+  remorph.reconcile.main main
+  INNER JOIN remorph.reconcile.aggregate_metrics metrics
+    ON main.recon_table_id = metrics.recon_table_id
+ORDER BY
+  metrics.inserted_ts DESC,
+  main.recon_id,
+  main.target_table.table_name