databricks-labs-lakebridge 0.10.7__py3-none-any.whl → 0.10.8__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries. Long removed lines that the upstream diff view truncated are marked below with a trailing `…`.
- databricks/labs/lakebridge/__about__.py +1 -1
- databricks/labs/lakebridge/assessments/profiler_validator.py +103 -0
- databricks/labs/lakebridge/base_install.py +1 -5
- databricks/labs/lakebridge/cli.py +13 -6
- databricks/labs/lakebridge/helpers/validation.py +5 -3
- databricks/labs/lakebridge/install.py +40 -481
- databricks/labs/lakebridge/reconcile/connectors/data_source.py +9 -5
- databricks/labs/lakebridge/reconcile/connectors/databricks.py +2 -1
- databricks/labs/lakebridge/reconcile/connectors/oracle.py +2 -1
- databricks/labs/lakebridge/reconcile/connectors/secrets.py +19 -1
- databricks/labs/lakebridge/reconcile/connectors/snowflake.py +50 -29
- databricks/labs/lakebridge/reconcile/connectors/tsql.py +2 -1
- databricks/labs/lakebridge/reconcile/query_builder/base.py +50 -11
- databricks/labs/lakebridge/reconcile/query_builder/expression_generator.py +8 -2
- databricks/labs/lakebridge/reconcile/query_builder/hash_query.py +7 -13
- databricks/labs/lakebridge/reconcile/query_builder/sampling_query.py +18 -19
- databricks/labs/lakebridge/reconcile/query_builder/threshold_query.py +36 -15
- databricks/labs/lakebridge/reconcile/recon_config.py +0 -15
- databricks/labs/lakebridge/reconcile/reconciliation.py +4 -1
- databricks/labs/lakebridge/reconcile/trigger_recon_aggregate_service.py +11 -31
- databricks/labs/lakebridge/reconcile/trigger_recon_service.py +4 -1
- databricks/labs/lakebridge/transpiler/execute.py +34 -28
- databricks/labs/lakebridge/transpiler/installers.py +523 -0
- databricks/labs/lakebridge/transpiler/lsp/lsp_engine.py +2 -0
- {databricks_labs_lakebridge-0.10.7.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/METADATA +1 -1
- {databricks_labs_lakebridge-0.10.7.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/RECORD +30 -28
- {databricks_labs_lakebridge-0.10.7.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/WHEEL +0 -0
- {databricks_labs_lakebridge-0.10.7.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/entry_points.txt +0 -0
- {databricks_labs_lakebridge-0.10.7.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/licenses/LICENSE +0 -0
- {databricks_labs_lakebridge-0.10.7.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/licenses/NOTICE +0 -0
databricks/labs/lakebridge/reconcile/trigger_recon_aggregate_service.py:

```diff
@@ -10,15 +10,13 @@ from databricks.labs.lakebridge.reconcile.recon_capture import (
     ReconIntermediatePersist,
     generate_final_reconcile_aggregate_output,
 )
-from databricks.labs.lakebridge.reconcile.recon_config import …
+from databricks.labs.lakebridge.reconcile.recon_config import AGG_RECONCILE_OPERATION_NAME
 from databricks.labs.lakebridge.reconcile.recon_output_config import (
     ReconcileProcessDuration,
     AggregateQueryOutput,
     DataReconcileOutput,
 )
-from databricks.labs.lakebridge.reconcile.reconciliation import Reconciliation
 from databricks.labs.lakebridge.reconcile.trigger_recon_service import TriggerReconService
-from databricks.labs.lakebridge.reconcile.normalize_recon_config_service import NormalizeReconConfigService


 class TriggerReconAggregateService:
@@ -36,42 +34,36 @@ class TriggerReconAggregateService:

         # Get the Aggregated Reconciliation Output for each table
         for table_conf in table_recon.tables:
-            normalized_table_conf = NormalizeReconConfigService(
-                reconciler.source, reconciler.target
-            ).normalize_recon_table_config(table_conf)
-
             recon_process_duration = ReconcileProcessDuration(start_ts=str(datetime.now()), end_ts=None)
             try:
                 src_schema, tgt_schema = TriggerReconService.get_schemas(
-                    reconciler.source, reconciler.target, …
+                    reconciler.source, reconciler.target, table_conf, reconcile_config.database_config, False
                 )
             except DataSourceRuntimeException as e:
                 raise ReconciliationException(message=str(e)) from e

-            assert …
+            assert table_conf.aggregates, "Aggregates must be defined for Aggregates Reconciliation"

-            …
-            …
-            …
-            …
-            …
-            …
-                )
-            )
+            try:
+                table_reconcile_agg_output_list = reconciler.reconcile_aggregates(table_conf, src_schema, tgt_schema)
+            except DataSourceRuntimeException as e:
+                table_reconcile_agg_output_list = [
+                    AggregateQueryOutput(reconcile_output=DataReconcileOutput(exception=str(e)), rule=None)
+                ]

             recon_process_duration.end_ts = str(datetime.now())

             # Persist the data to the delta tables
             recon_capture.store_aggregates_metrics(
                 reconcile_agg_output_list=table_reconcile_agg_output_list,
-                table_conf=…
+                table_conf=table_conf,
                 recon_process_duration=recon_process_duration,
             )

             (
                 ReconIntermediatePersist(
                     spark=spark,
-                    path=utils.generate_volume_path(…
+                    path=utils.generate_volume_path(table_conf, reconcile_config.metadata_config),
                 ).clean_unmatched_df_from_volume()
             )

@@ -84,15 +76,3 @@ class TriggerReconAggregateService:
             ),
             operation_name=AGG_RECONCILE_OPERATION_NAME,
         )
-
-    @staticmethod
-    def _run_reconcile_aggregates(
-        reconciler: Reconciliation,
-        table_conf: Table,
-        src_schema: list[Schema],
-        tgt_schema: list[Schema],
-    ) -> list[AggregateQueryOutput]:
-        try:
-            return reconciler.reconcile_aggregates(table_conf, src_schema, tgt_schema)
-        except DataSourceRuntimeException as e:
-            return [AggregateQueryOutput(reconcile_output=DataReconcileOutput(exception=str(e)), rule=None)]
```
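The `_run_reconcile_aggregates` helper removed above is inlined at its single call site as a plain try/except. A minimal, runnable sketch of that error-capture pattern, using simplified stand-ins for the real `recon_output_config` types:

```python
# Sketch only: simplified stand-ins for DataReconcileOutput / AggregateQueryOutput
# and the reconciler, to illustrate folding DataSourceRuntimeException into the
# output list instead of letting it propagate.
from dataclasses import dataclass


class DataSourceRuntimeException(Exception):
    pass


@dataclass
class DataReconcileOutput:
    exception: str | None = None


@dataclass
class AggregateQueryOutput:
    reconcile_output: DataReconcileOutput
    rule: object | None = None


def reconcile_aggregates(table_conf, src_schema, tgt_schema) -> list[AggregateQueryOutput]:
    raise DataSourceRuntimeException("source table not found")  # simulate a failure


try:
    outputs = reconcile_aggregates("tbl", [], [])
except DataSourceRuntimeException as e:
    # Same shape as the inlined handler in the hunk above
    outputs = [AggregateQueryOutput(reconcile_output=DataReconcileOutput(exception=str(e)), rule=None)]

print(outputs[0].reconcile_output.exception)  # -> source table not found
```

Folding the exception into `AggregateQueryOutput` keeps a per-table failure from aborting the loop over `table_recon.tables`.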
databricks/labs/lakebridge/reconcile/trigger_recon_service.py:

```diff
@@ -138,7 +138,7 @@ class TriggerReconService:

         try:
             src_schema, tgt_schema = TriggerReconService.get_schemas(
-                reconciler.source, reconciler.target, table_conf, reconcile_config.database_config
+                reconciler.source, reconciler.target, table_conf, reconcile_config.database_config, True
             )
         except DataSourceRuntimeException as e:
             schema_reconcile_output = SchemaReconcileOutput(is_valid=False, exception=str(e))
@@ -170,17 +170,20 @@ class TriggerReconService:
         target: DataSource,
         table_conf: Table,
         database_config: DatabaseConfig,
+        normalize: bool,
     ) -> tuple[list[Schema], list[Schema]]:
         src_schema = source.get_schema(
             catalog=database_config.source_catalog,
             schema=database_config.source_schema,
             table=table_conf.source_name,
+            normalize=normalize,
         )

         tgt_schema = target.get_schema(
             catalog=database_config.target_catalog,
             schema=database_config.target_schema,
             table=table_conf.target_name,
+            normalize=normalize,
         )

         return src_schema, tgt_schema
```
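Both call sites now pass the new `normalize` flag positionally (`True` on the row-level recon path, `False` on the aggregates path), and `get_schemas` forwards it to each `DataSource.get_schema` call. A toy sketch of how such a flag might thread through; the `ToyDataSource` below is hypothetical, and the real connectors in `reconcile/connectors/` define their own normalization rules:

```python
# Illustration only: a toy DataSource showing a normalize flag affecting
# get_schema output; not the lakebridge connector implementation.
from dataclasses import dataclass


@dataclass
class Schema:
    column_name: str
    data_type: str


class ToyDataSource:
    def get_schema(self, *, catalog: str, schema: str, table: str, normalize: bool) -> list[Schema]:
        raw = [Schema("CustomerID", "NUMBER"), Schema("Name", "VARCHAR")]
        if not normalize:
            return raw  # aggregates path (normalize=False): keep names as-is
        # row-level path (normalize=True): canonicalize for comparison
        return [Schema(c.column_name.lower(), c.data_type.lower()) for c in raw]


src = ToyDataSource().get_schema(catalog="c", schema="s", table="t", normalize=True)
print([c.column_name for c in src])  # -> ['customerid', 'name']
```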
databricks/labs/lakebridge/transpiler/execute.py:

```diff
@@ -48,6 +48,26 @@ class TranspilingContext:
     transpiled_code: str | None = None


+def _validate_transpiled_sql(context: TranspilingContext, content: str, error_list: list[TranspileError]) -> str:
+    if context.validator is None:
+        return content
+    validation_result = _validation(context.validator, context.config, str(content))
+    # Potentially expensive, only evaluate if debug is enabled
+    if logger.isEnabledFor(logging.DEBUG):
+        msg = f"Finished validating transpiled code for file: {context.input_path} (result: {validation_result})"
+        logger.debug(msg)
+    if validation_result.exception_msg is not None:
+        error = TranspileError(
+            "VALIDATION_ERROR",
+            ErrorKind.VALIDATION,
+            ErrorSeverity.WARNING,
+            context.input_path,
+            validation_result.exception_msg,
+        )
+        error_list.append(error)
+    return validation_result.validated_sql
+
+
 async def _process_one_file(context: TranspilingContext) -> tuple[int, list[TranspileError]]:
     input_path = context.input_path

```
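The new `_validate_transpiled_sql` helper is a no-op when no validator is configured, records validation failures as `WARNING`-severity `TranspileError`s instead of raising, and always returns the validator's SQL. A reduced sketch of that control flow, with hypothetical stand-in types rather than lakebridge's real ones:

```python
# Reduced sketch of the helper's control flow; ValidationResult and the
# callable validator below are hypothetical stand-ins, not lakebridge APIs.
from dataclasses import dataclass, field
from typing import Callable


@dataclass
class ValidationResult:
    validated_sql: str
    exception_msg: str | None = None


@dataclass
class Context:
    validator: Callable[[str], ValidationResult] | None = None
    errors: list[str] = field(default_factory=list)


def validate_transpiled_sql(ctx: Context, content: str) -> str:
    if ctx.validator is None:
        return content  # validation is optional; pass content through untouched
    result = ctx.validator(content)
    if result.exception_msg is not None:
        # Recorded as a warning; the run is not aborted
        ctx.errors.append(f"VALIDATION_ERROR: {result.exception_msg}")
    return result.validated_sql  # always use the validator's SQL


def fake_validator(sql: str) -> ValidationResult:
    # Pretend the target dialect rejects FOO(); flag it but still return SQL
    if "FOO(" in sql:
        return ValidationResult(validated_sql=f"-- FIXME\n{sql}", exception_msg="unknown function FOO")
    return ValidationResult(validated_sql=sql)


ctx = Context(validator=fake_validator)
print(validate_transpiled_sql(ctx, "SELECT FOO(1)"))
print(ctx.errors)  # ['VALIDATION_ERROR: unknown function FOO']
```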
```diff
@@ -89,29 +109,29 @@ async def _process_one_file(context: TranspilingContext) -> tuple[int, list[TranspileError]]:
     assert output_path is not None, "Output path must be set in the context"
     output_path.parent.mkdir(exist_ok=True)

-    if …
-        …
+    if _is_mime_result(transpile_result):
+        _process_mime_result(context, error_list)
     else:
-        …
+        _process_non_mime_result(context, error_list)

     return transpile_result.success_count, error_list


-def …
+def _is_mime_result(result: TranspileResult):
     return result.transpiled_code.startswith("Content-Type: multipart/mixed; boundary=")


-def …
+def _process_mime_result(context: TranspilingContext, error_list: list[TranspileError]) -> None:
     # TODO error handling
     # Added policy to process quoted-printable encoded
     parser = EmailParser(policy=policy.default)
     transpiled_code: str = cast(str, context.transpiled_code)
     message: Message = parser.parsestr(transpiled_code)
     for part in message.walk():
-        _process_combined_part(context, part)
+        _process_combined_part(context, part, error_list)


-def _process_combined_part(context: TranspilingContext, part: Message) -> None:
+def _process_combined_part(context: TranspilingContext, part: Message, error_list: list[TranspileError]) -> None:
     if part.get_content_type() != "text/plain":
         return  # TODO Need to handle other content types, e.g., text/binary, application/json, etc.
     filename = part.get_filename()
```
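The MIME branch leans on the standard library: `email.parser.Parser` with `policy.default` (which also decodes quoted-printable parts, per the comment) and `Message.walk()` to visit each embedded file. A self-contained demo of that pattern on a fabricated multipart payload:

```python
# Demo only: fabricated multipart payload mirroring the parsing pattern above
# (EmailParser with policy.default, walking text/plain parts).
from email import policy
from email.parser import Parser as EmailParser

raw = (
    "Content-Type: multipart/mixed; boundary=BOUND\n"
    "\n"
    "--BOUND\n"
    'Content-Type: text/plain; charset="utf-8"\n'
    "Content-Disposition: attachment; filename=query_1.sql\n"
    "\n"
    "SELECT 1;\n"
    "--BOUND--\n"
)

message = EmailParser(policy=policy.default).parsestr(raw)
for part in message.walk():
    if part.get_content_type() != "text/plain":
        continue  # skips the multipart container itself, as in the hunk above
    print(part.get_filename(), "->", part.get_content().strip())
# query_1.sql -> SELECT 1;
```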
```diff
@@ -133,35 +153,21 @@ def _process_combined_part(context: TranspilingContext, part: Message) -> None:
     folder.mkdir(parents=True, exist_ok=True)
     output = folder / segments[-1]
     logger.debug(f"Writing output to: {output}")
+    # Only validate if output file has .sql suffix
+    if output.suffix == ".sql":
+        content = _validate_transpiled_sql(context, content, error_list)
     output.write_text(content)


-def …
+def _process_non_mime_result(context: TranspilingContext, error_list: list[TranspileError]) -> None:

     output_code: str = context.transpiled_code or ""
+    output_path = cast(Path, context.output_path)

     if any(err.kind == ErrorKind.PARSING for err in error_list):
         output_code = context.source_code or ""
-
-    …
-        logger.debug(f"Validating transpiled code for file: {context.input_path}")
-        validation_result = _validation(context.validator, context.config, str(context.transpiled_code))
-        # Potentially expensive, only evaluate if debug is enabled
-        if logger.isEnabledFor(logging.DEBUG):
-            msg = f"Finished validating transpiled code for file: {context.input_path} (result: {validation_result})"
-            logger.debug(msg)
-        if validation_result.exception_msg is not None:
-            error = TranspileError(
-                "VALIDATION_ERROR",
-                ErrorKind.VALIDATION,
-                ErrorSeverity.WARNING,
-                context.input_path,
-                validation_result.exception_msg,
-            )
-            error_list.append(error)
-        output_code = validation_result.validated_sql
-
-    output_path = cast(Path, context.output_path)
+    elif output_path.suffix == ".sql":
+        output_code = _validate_transpiled_sql(context, output_code, error_list)
     with output_path.open("w") as w:
         # The above adds a java-style comment block at the top of the output file
         # This would break .py or .json outputs so we disable it for now.
```