databricks-labs-lakebridge 0.10.6__py3-none-any.whl → 0.10.8__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their public registries. It is provided for informational purposes only.
Files changed (46)
  1. databricks/labs/lakebridge/__about__.py +1 -1
  2. databricks/labs/lakebridge/analyzer/__init__.py +0 -0
  3. databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py +95 -0
  4. databricks/labs/lakebridge/assessments/profiler_validator.py +103 -0
  5. databricks/labs/lakebridge/base_install.py +20 -3
  6. databricks/labs/lakebridge/cli.py +32 -59
  7. databricks/labs/lakebridge/contexts/application.py +7 -0
  8. databricks/labs/lakebridge/deployment/job.py +2 -2
  9. databricks/labs/lakebridge/helpers/file_utils.py +36 -0
  10. databricks/labs/lakebridge/helpers/validation.py +5 -3
  11. databricks/labs/lakebridge/install.py +73 -484
  12. databricks/labs/lakebridge/reconcile/compare.py +70 -33
  13. databricks/labs/lakebridge/reconcile/connectors/data_source.py +24 -1
  14. databricks/labs/lakebridge/reconcile/connectors/databricks.py +12 -1
  15. databricks/labs/lakebridge/reconcile/connectors/dialect_utils.py +126 -0
  16. databricks/labs/lakebridge/reconcile/connectors/models.py +7 -0
  17. databricks/labs/lakebridge/reconcile/connectors/oracle.py +12 -1
  18. databricks/labs/lakebridge/reconcile/connectors/secrets.py +19 -1
  19. databricks/labs/lakebridge/reconcile/connectors/snowflake.py +63 -30
  20. databricks/labs/lakebridge/reconcile/connectors/tsql.py +28 -2
  21. databricks/labs/lakebridge/reconcile/constants.py +4 -3
  22. databricks/labs/lakebridge/reconcile/execute.py +9 -810
  23. databricks/labs/lakebridge/reconcile/normalize_recon_config_service.py +133 -0
  24. databricks/labs/lakebridge/reconcile/query_builder/base.py +53 -18
  25. databricks/labs/lakebridge/reconcile/query_builder/expression_generator.py +8 -2
  26. databricks/labs/lakebridge/reconcile/query_builder/hash_query.py +7 -13
  27. databricks/labs/lakebridge/reconcile/query_builder/sampling_query.py +18 -19
  28. databricks/labs/lakebridge/reconcile/query_builder/threshold_query.py +36 -15
  29. databricks/labs/lakebridge/reconcile/recon_config.py +3 -15
  30. databricks/labs/lakebridge/reconcile/recon_output_config.py +2 -1
  31. databricks/labs/lakebridge/reconcile/reconciliation.py +511 -0
  32. databricks/labs/lakebridge/reconcile/schema_compare.py +26 -19
  33. databricks/labs/lakebridge/reconcile/trigger_recon_aggregate_service.py +78 -0
  34. databricks/labs/lakebridge/reconcile/trigger_recon_service.py +256 -0
  35. databricks/labs/lakebridge/reconcile/utils.py +38 -0
  36. databricks/labs/lakebridge/transpiler/execute.py +34 -28
  37. databricks/labs/lakebridge/transpiler/installers.py +523 -0
  38. databricks/labs/lakebridge/transpiler/lsp/lsp_engine.py +47 -60
  39. databricks/labs/lakebridge/transpiler/sqlglot/dialect_utils.py +2 -0
  40. databricks/labs/lakebridge/transpiler/transpile_engine.py +0 -18
  41. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/METADATA +1 -1
  42. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/RECORD +46 -35
  43. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/WHEEL +0 -0
  44. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/entry_points.txt +0 -0
  45. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/licenses/LICENSE +0 -0
  46. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/licenses/NOTICE +0 -0
databricks/labs/lakebridge/reconcile/trigger_recon_service.py
@@ -0,0 +1,256 @@
+ import logging
+ from datetime import datetime
+ from uuid import uuid4
+
+ from pyspark.errors import PySparkException
+ from pyspark.sql import SparkSession
+
+ from databricks.sdk import WorkspaceClient
+
+ from databricks.labs.lakebridge.config import ReconcileConfig, TableRecon, DatabaseConfig
+ from databricks.labs.lakebridge.reconcile import utils
+ from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource
+ from databricks.labs.lakebridge.reconcile.exception import DataSourceRuntimeException, ReconciliationException
+ from databricks.labs.lakebridge.reconcile.recon_capture import (
+     ReconCapture,
+     ReconIntermediatePersist,
+     generate_final_reconcile_output,
+ )
+ from databricks.labs.lakebridge.reconcile.recon_config import Table, Schema
+ from databricks.labs.lakebridge.reconcile.recon_output_config import (
+     ReconcileOutput,
+     ReconcileProcessDuration,
+     SchemaReconcileOutput,
+     DataReconcileOutput,
+ )
+ from databricks.labs.lakebridge.reconcile.reconciliation import Reconciliation
+ from databricks.labs.lakebridge.reconcile.schema_compare import SchemaCompare
+ from databricks.labs.lakebridge.reconcile.normalize_recon_config_service import NormalizeReconConfigService
+ from databricks.labs.lakebridge.transpiler.execute import verify_workspace_client
+ from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import get_dialect
+
+ logger = logging.getLogger(__name__)
+ _RECON_REPORT_TYPES = {"schema", "data", "row", "all", "aggregate"}
+
+
+ class TriggerReconService:
+
+     @staticmethod
+     def trigger_recon(
+         ws: WorkspaceClient,
+         spark: SparkSession,
+         table_recon: TableRecon,
+         reconcile_config: ReconcileConfig,
+         local_test_run: bool = False,
+     ) -> ReconcileOutput:
+         reconciler, recon_capture = TriggerReconService.create_recon_dependencies(
+             ws, spark, reconcile_config, local_test_run
+         )
+
+         for table_conf in table_recon.tables:
+             TriggerReconService.recon_one(spark, reconciler, recon_capture, reconcile_config, table_conf)
+
+         return TriggerReconService.verify_successful_reconciliation(
+             generate_final_reconcile_output(
+                 recon_id=recon_capture.recon_id,
+                 spark=spark,
+                 metadata_config=reconcile_config.metadata_config,
+                 local_test_run=local_test_run,
+             )
+         )
+
+     @staticmethod
+     def create_recon_dependencies(
+         ws: WorkspaceClient, spark: SparkSession, reconcile_config: ReconcileConfig, local_test_run: bool = False
+     ) -> tuple[Reconciliation, ReconCapture]:
+         ws_client: WorkspaceClient = verify_workspace_client(ws)
+
+         # validate the report type
+         report_type = reconcile_config.report_type.lower()
+         logger.info(f"report_type: {report_type}, data_source: {reconcile_config.data_source} ")
+         utils.validate_input(report_type, _RECON_REPORT_TYPES, "Invalid report type")
+
+         source, target = utils.initialise_data_source(
+             engine=reconcile_config.data_source,
+             spark=spark,
+             ws=ws_client,
+             secret_scope=reconcile_config.secret_scope,
+         )
+
+         recon_id = str(uuid4())
+         # initialise the Reconciliation
+         reconciler = Reconciliation(
+             source,
+             target,
+             reconcile_config.database_config,
+             report_type,
+             SchemaCompare(spark=spark),
+             get_dialect(reconcile_config.data_source),
+             spark,
+             metadata_config=reconcile_config.metadata_config,
+         )
+
+         recon_capture = ReconCapture(
+             database_config=reconcile_config.database_config,
+             recon_id=recon_id,
+             report_type=report_type,
+             source_dialect=get_dialect(reconcile_config.data_source),
+             ws=ws_client,
+             spark=spark,
+             metadata_config=reconcile_config.metadata_config,
+             local_test_run=local_test_run,
+         )
+
+         return reconciler, recon_capture
+
+     @staticmethod
+     def recon_one(
+         spark: SparkSession,
+         reconciler: Reconciliation,
+         recon_capture: ReconCapture,
+         reconcile_config: ReconcileConfig,
+         table_conf: Table,
+     ):
+         normalized_table_conf = NormalizeReconConfigService(
+             reconciler.source, reconciler.target
+         ).normalize_recon_table_config(table_conf)
+
+         schema_reconcile_output, data_reconcile_output, recon_process_duration = TriggerReconService._do_recon_one(
+             reconciler, reconcile_config, normalized_table_conf
+         )
+
+         TriggerReconService.persist_delta_table(
+             spark,
+             reconciler,
+             recon_capture,
+             schema_reconcile_output,
+             data_reconcile_output,
+             reconcile_config,
+             normalized_table_conf,
+             recon_process_duration,
+         )
+
+     @staticmethod
+     def _do_recon_one(reconciler: Reconciliation, reconcile_config: ReconcileConfig, table_conf: Table):
+         recon_process_duration = ReconcileProcessDuration(start_ts=str(datetime.now()), end_ts=None)
+         schema_reconcile_output = SchemaReconcileOutput(is_valid=True)
+         data_reconcile_output = DataReconcileOutput()
+
+         try:
+             src_schema, tgt_schema = TriggerReconService.get_schemas(
+                 reconciler.source, reconciler.target, table_conf, reconcile_config.database_config, True
+             )
+         except DataSourceRuntimeException as e:
+             schema_reconcile_output = SchemaReconcileOutput(is_valid=False, exception=str(e))
+         else:
+             if reconciler.report_type in {"schema", "all"}:
+                 schema_reconcile_output = TriggerReconService._run_reconcile_schema(
+                     reconciler=reconciler,
+                     table_conf=table_conf,
+                     src_schema=src_schema,
+                     tgt_schema=tgt_schema,
+                 )
+                 logger.warning("Schema comparison is completed.")
+
+             if reconciler.report_type in {"data", "row", "all"}:
+                 data_reconcile_output = TriggerReconService._run_reconcile_data(
+                     reconciler=reconciler,
+                     table_conf=table_conf,
+                     src_schema=src_schema,
+                     tgt_schema=tgt_schema,
+                 )
+                 logger.warning(f"Reconciliation for '{reconciler.report_type}' report completed.")
+
+         recon_process_duration.end_ts = str(datetime.now())
+         return schema_reconcile_output, data_reconcile_output, recon_process_duration
+
+     @staticmethod
+     def get_schemas(
+         source: DataSource,
+         target: DataSource,
+         table_conf: Table,
+         database_config: DatabaseConfig,
+         normalize: bool,
+     ) -> tuple[list[Schema], list[Schema]]:
+         src_schema = source.get_schema(
+             catalog=database_config.source_catalog,
+             schema=database_config.source_schema,
+             table=table_conf.source_name,
+             normalize=normalize,
+         )
+
+         tgt_schema = target.get_schema(
+             catalog=database_config.target_catalog,
+             schema=database_config.target_schema,
+             table=table_conf.target_name,
+             normalize=normalize,
+         )
+
+         return src_schema, tgt_schema
+
+     @staticmethod
+     def _run_reconcile_schema(
+         reconciler: Reconciliation,
+         table_conf: Table,
+         src_schema: list[Schema],
+         tgt_schema: list[Schema],
+     ):
+         try:
+             return reconciler.reconcile_schema(table_conf=table_conf, src_schema=src_schema, tgt_schema=tgt_schema)
+         except PySparkException as e:
+             return SchemaReconcileOutput(is_valid=False, exception=str(e))
+
+     @staticmethod
+     def _run_reconcile_data(
+         reconciler: Reconciliation,
+         table_conf: Table,
+         src_schema: list[Schema],
+         tgt_schema: list[Schema],
+     ) -> DataReconcileOutput:
+         try:
+             return reconciler.reconcile_data(table_conf=table_conf, src_schema=src_schema, tgt_schema=tgt_schema)
+         except DataSourceRuntimeException as e:
+             return DataReconcileOutput(exception=str(e))
+
+     @staticmethod
+     def persist_delta_table(
+         spark: SparkSession,
+         reconciler: Reconciliation,
+         recon_capture: ReconCapture,
+         schema_reconcile_output: SchemaReconcileOutput,
+         data_reconcile_output: DataReconcileOutput,
+         reconcile_config: ReconcileConfig,
+         table_conf: Table,
+         recon_process_duration: ReconcileProcessDuration,
+     ):
+         recon_capture.start(
+             data_reconcile_output=data_reconcile_output,
+             schema_reconcile_output=schema_reconcile_output,
+             table_conf=table_conf,
+             recon_process_duration=recon_process_duration,
+             record_count=reconciler.get_record_count(table_conf, reconciler.report_type),
+         )
+         if reconciler.report_type != "schema":
+             ReconIntermediatePersist(
+                 spark=spark, path=utils.generate_volume_path(table_conf, reconcile_config.metadata_config)
+             ).clean_unmatched_df_from_volume()
+
+     @staticmethod
+     def verify_successful_reconciliation(
+         reconcile_output: ReconcileOutput, operation_name: str = "reconcile"
+     ) -> ReconcileOutput:
+         for table_output in reconcile_output.results:
+             if table_output.exception_message or (
+                 table_output.status.column is False
+                 or table_output.status.row is False
+                 or table_output.status.schema is False
+                 or table_output.status.aggregate is False
+             ):
+                 raise ReconciliationException(
+                     f" Reconciliation failed for one or more tables. Please check the recon metrics for more details."
+                     f" **{operation_name}** failed.",
+                     reconcile_output=reconcile_output,
+                 )
+
+         logger.info("Reconciliation completed successfully.")
+         return reconcile_output
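
Note: the new trigger_recon_service.py module extracts the orchestration previously embedded in reconcile/execute.py into a static-method service. Below is a minimal usage sketch, assuming an existing Databricks SparkSession (`spark`) and workspace authentication; the catalog, schema, secret scope, and table names are hypothetical placeholders, and the config constructors are inferred from the attributes referenced in this diff (exact signatures may differ).

# Sketch only: driving the new TriggerReconService directly from a notebook or job.
# `spark` is assumed to exist already; all literal values below are placeholders.
from databricks.sdk import WorkspaceClient
from databricks.labs.lakebridge.config import DatabaseConfig, ReconcileConfig, ReconcileMetadataConfig, TableRecon
from databricks.labs.lakebridge.reconcile.exception import ReconciliationException
from databricks.labs.lakebridge.reconcile.trigger_recon_service import TriggerReconService

ws = WorkspaceClient()

reconcile_config = ReconcileConfig(
    data_source="snowflake",   # source dialect; Databricks is always the target
    report_type="all",         # must be one of {"schema", "data", "row", "all", "aggregate"}
    secret_scope="my_scope",
    database_config=DatabaseConfig(
        source_catalog="SRC_DB",
        source_schema="PUBLIC",
        target_catalog="main",
        target_schema="recon_target",
    ),
    metadata_config=ReconcileMetadataConfig(catalog="main", schema="recon_meta", volume="recon_vol"),
)
table_recon = TableRecon(...)  # per-table configs; each entry is a recon_config.Table

try:
    # Runs schema and/or data reconciliation per table, persists metrics via ReconCapture,
    # and raises if any table fails its schema/row/column/aggregate checks.
    output = TriggerReconService.trigger_recon(ws, spark, table_recon, reconcile_config)
    print(f"Reconciled {len(output.results)} table(s)")
except ReconciliationException as failure:
    print(f"Reconciliation failed: {failure}")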
databricks/labs/lakebridge/reconcile/utils.py
@@ -0,0 +1,38 @@
+ import logging
+
+ from pyspark.sql import SparkSession
+
+ from databricks.sdk import WorkspaceClient
+
+ from databricks.labs.lakebridge.config import ReconcileMetadataConfig
+ from databricks.labs.lakebridge.reconcile.connectors.source_adapter import create_adapter
+ from databricks.labs.lakebridge.reconcile.exception import InvalidInputException
+ from databricks.labs.lakebridge.reconcile.recon_config import Table
+ from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import get_dialect
+
+ logger = logging.getLogger(__name__)
+
+
+ def initialise_data_source(
+     ws: WorkspaceClient,
+     spark: SparkSession,
+     engine: str,
+     secret_scope: str,
+ ):
+     source = create_adapter(engine=get_dialect(engine), spark=spark, ws=ws, secret_scope=secret_scope)
+     target = create_adapter(engine=get_dialect("databricks"), spark=spark, ws=ws, secret_scope=secret_scope)
+
+     return source, target
+
+
+ def validate_input(input_value: str, list_of_value: set, message: str):
+     if input_value not in list_of_value:
+         error_message = f"{message} --> {input_value} is not one of {list_of_value}"
+         logger.error(error_message)
+         raise InvalidInputException(error_message)
+
+
+ def generate_volume_path(table_conf: Table, metadata_config: ReconcileMetadataConfig):
+     catalog = metadata_config.catalog
+     schema = metadata_config.schema
+     return f"/Volumes/{catalog}/{schema}/{metadata_config.volume}/{table_conf.source_name}_{table_conf.target_name}/"
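
Note: these helpers were factored out of reconcile/execute.py into a shared module used by the new trigger services. A small illustration follows; the catalog/schema/volume and table names are hypothetical, the ReconcileMetadataConfig constructor is inferred from the fields used above, and it is assumed that a Table can be built from just source_name and target_name.

# Illustration only: all literal values below are hypothetical.
from databricks.labs.lakebridge.config import ReconcileMetadataConfig
from databricks.labs.lakebridge.reconcile import utils
from databricks.labs.lakebridge.reconcile.exception import InvalidInputException
from databricks.labs.lakebridge.reconcile.recon_config import Table

# generate_volume_path builds the Unity Catalog volume path used for intermediate
# (unmatched) rows, e.g. "/Volumes/main/recon_meta/recon_vol/orders_src_orders_tgt/".
meta = ReconcileMetadataConfig(catalog="main", schema="recon_meta", volume="recon_vol")
table = Table(source_name="orders_src", target_name="orders_tgt")  # other Table fields assumed optional
print(utils.generate_volume_path(table, meta))

# validate_input raises InvalidInputException when the value is not in the allowed set.
try:
    utils.validate_input("rows", {"schema", "data", "row", "all", "aggregate"}, "Invalid report type")
except InvalidInputException as exc:
    print(exc)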
databricks/labs/lakebridge/transpiler/execute.py
@@ -48,6 +48,26 @@ class TranspilingContext:
      transpiled_code: str | None = None


+ def _validate_transpiled_sql(context: TranspilingContext, content: str, error_list: list[TranspileError]) -> str:
+     if context.validator is None:
+         return content
+     validation_result = _validation(context.validator, context.config, str(content))
+     # Potentially expensive, only evaluate if debug is enabled
+     if logger.isEnabledFor(logging.DEBUG):
+         msg = f"Finished validating transpiled code for file: {context.input_path} (result: {validation_result})"
+         logger.debug(msg)
+     if validation_result.exception_msg is not None:
+         error = TranspileError(
+             "VALIDATION_ERROR",
+             ErrorKind.VALIDATION,
+             ErrorSeverity.WARNING,
+             context.input_path,
+             validation_result.exception_msg,
+         )
+         error_list.append(error)
+     return validation_result.validated_sql
+
+
  async def _process_one_file(context: TranspilingContext) -> tuple[int, list[TranspileError]]:
      input_path = context.input_path

@@ -89,29 +109,29 @@ async def _process_one_file(context: TranspilingContext) -> tuple[int, list[TranspileError]]:
      assert output_path is not None, "Output path must be set in the context"
      output_path.parent.mkdir(exist_ok=True)

-     if _is_combined_result(transpile_result):
-         _process_combined_result(context, error_list)
+     if _is_mime_result(transpile_result):
+         _process_mime_result(context, error_list)
      else:
-         _process_single_result(context, error_list)
+         _process_non_mime_result(context, error_list)

      return transpile_result.success_count, error_list


- def _is_combined_result(result: TranspileResult):
+ def _is_mime_result(result: TranspileResult):
      return result.transpiled_code.startswith("Content-Type: multipart/mixed; boundary=")


- def _process_combined_result(context: TranspilingContext, _error_list: list[TranspileError]) -> None:
+ def _process_mime_result(context: TranspilingContext, error_list: list[TranspileError]) -> None:
      # TODO error handling
      # Added policy to process quoted-printable encoded
      parser = EmailParser(policy=policy.default)
      transpiled_code: str = cast(str, context.transpiled_code)
      message: Message = parser.parsestr(transpiled_code)
      for part in message.walk():
-         _process_combined_part(context, part)
+         _process_combined_part(context, part, error_list)


- def _process_combined_part(context: TranspilingContext, part: Message) -> None:
+ def _process_combined_part(context: TranspilingContext, part: Message, error_list: list[TranspileError]) -> None:
      if part.get_content_type() != "text/plain":
          return  # TODO Need to handle other content types, e.g., text/binary, application/json, etc.
      filename = part.get_filename()
@@ -133,35 +153,21 @@ def _process_combined_part(context: TranspilingContext, part: Message) -> None:
      folder.mkdir(parents=True, exist_ok=True)
      output = folder / segments[-1]
      logger.debug(f"Writing output to: {output}")
+     # Only validate if output file has .sql suffix
+     if output.suffix == ".sql":
+         content = _validate_transpiled_sql(context, content, error_list)
      output.write_text(content)


- def _process_single_result(context: TranspilingContext, error_list: list[TranspileError]) -> None:
+ def _process_non_mime_result(context: TranspilingContext, error_list: list[TranspileError]) -> None:

      output_code: str = context.transpiled_code or ""
+     output_path = cast(Path, context.output_path)

      if any(err.kind == ErrorKind.PARSING for err in error_list):
          output_code = context.source_code or ""
-
-     elif context.validator:
-         logger.debug(f"Validating transpiled code for file: {context.input_path}")
-         validation_result = _validation(context.validator, context.config, str(context.transpiled_code))
-         # Potentially expensive, only evaluate if debug is enabled
-         if logger.isEnabledFor(logging.DEBUG):
-             msg = f"Finished validating transpiled code for file: {context.input_path} (result: {validation_result})"
-             logger.debug(msg)
-         if validation_result.exception_msg is not None:
-             error = TranspileError(
-                 "VALIDATION_ERROR",
-                 ErrorKind.VALIDATION,
-                 ErrorSeverity.WARNING,
-                 context.input_path,
-                 validation_result.exception_msg,
-             )
-             error_list.append(error)
-         output_code = validation_result.validated_sql
-
-     output_path = cast(Path, context.output_path)
+     elif output_path.suffix == ".sql":
+         output_code = _validate_transpiled_sql(context, output_code, error_list)
      with output_path.open("w") as w:
          # The above adds a java-style comment block at the top of the output file
          # This would break .py or .json outputs so we disable it for now.
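
Note: taken together, these hunks consolidate the previously duplicated validation logic into _validate_transpiled_sql and gate it on the output file's .sql suffix, so non-SQL artifacts from a transpile run are written without passing through the SQL validator. A trivial sketch of the suffix gate, using pathlib with hypothetical file names:

from pathlib import Path

# Only .sql outputs are routed through _validate_transpiled_sql; anything else
# (e.g. a notebook transpiled to .py, or a JSON sidecar) is written as-is.
for name in ("query_1.sql", "notebook_1.py", "manifest.json"):
    output = Path("out") / name
    print(f"{output}: validate={output.suffix == '.sql'}")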