databricks-labs-lakebridge 0.10.7__py3-none-any.whl → 0.10.8__py3-none-any.whl

This diff shows the changes between package versions as published to a supported public registry. It is provided for informational purposes only.
Files changed (30)
  1. databricks/labs/lakebridge/__about__.py +1 -1
  2. databricks/labs/lakebridge/assessments/profiler_validator.py +103 -0
  3. databricks/labs/lakebridge/base_install.py +1 -5
  4. databricks/labs/lakebridge/cli.py +13 -6
  5. databricks/labs/lakebridge/helpers/validation.py +5 -3
  6. databricks/labs/lakebridge/install.py +40 -481
  7. databricks/labs/lakebridge/reconcile/connectors/data_source.py +9 -5
  8. databricks/labs/lakebridge/reconcile/connectors/databricks.py +2 -1
  9. databricks/labs/lakebridge/reconcile/connectors/oracle.py +2 -1
  10. databricks/labs/lakebridge/reconcile/connectors/secrets.py +19 -1
  11. databricks/labs/lakebridge/reconcile/connectors/snowflake.py +50 -29
  12. databricks/labs/lakebridge/reconcile/connectors/tsql.py +2 -1
  13. databricks/labs/lakebridge/reconcile/query_builder/base.py +50 -11
  14. databricks/labs/lakebridge/reconcile/query_builder/expression_generator.py +8 -2
  15. databricks/labs/lakebridge/reconcile/query_builder/hash_query.py +7 -13
  16. databricks/labs/lakebridge/reconcile/query_builder/sampling_query.py +18 -19
  17. databricks/labs/lakebridge/reconcile/query_builder/threshold_query.py +36 -15
  18. databricks/labs/lakebridge/reconcile/recon_config.py +0 -15
  19. databricks/labs/lakebridge/reconcile/reconciliation.py +4 -1
  20. databricks/labs/lakebridge/reconcile/trigger_recon_aggregate_service.py +11 -31
  21. databricks/labs/lakebridge/reconcile/trigger_recon_service.py +4 -1
  22. databricks/labs/lakebridge/transpiler/execute.py +34 -28
  23. databricks/labs/lakebridge/transpiler/installers.py +523 -0
  24. databricks/labs/lakebridge/transpiler/lsp/lsp_engine.py +2 -0
  25. {databricks_labs_lakebridge-0.10.7.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/METADATA +1 -1
  26. {databricks_labs_lakebridge-0.10.7.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/RECORD +30 -28
  27. {databricks_labs_lakebridge-0.10.7.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/WHEEL +0 -0
  28. {databricks_labs_lakebridge-0.10.7.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/entry_points.txt +0 -0
  29. {databricks_labs_lakebridge-0.10.7.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/licenses/LICENSE +0 -0
  30. {databricks_labs_lakebridge-0.10.7.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/licenses/NOTICE +0 -0
databricks/labs/lakebridge/reconcile/trigger_recon_aggregate_service.py
@@ -10,15 +10,13 @@ from databricks.labs.lakebridge.reconcile.recon_capture import (
     ReconIntermediatePersist,
     generate_final_reconcile_aggregate_output,
 )
-from databricks.labs.lakebridge.reconcile.recon_config import Table, Schema, AGG_RECONCILE_OPERATION_NAME
+from databricks.labs.lakebridge.reconcile.recon_config import AGG_RECONCILE_OPERATION_NAME
 from databricks.labs.lakebridge.reconcile.recon_output_config import (
     ReconcileProcessDuration,
     AggregateQueryOutput,
     DataReconcileOutput,
 )
-from databricks.labs.lakebridge.reconcile.reconciliation import Reconciliation
 from databricks.labs.lakebridge.reconcile.trigger_recon_service import TriggerReconService
-from databricks.labs.lakebridge.reconcile.normalize_recon_config_service import NormalizeReconConfigService


 class TriggerReconAggregateService:
@@ -36,42 +34,36 @@ class TriggerReconAggregateService:

         # Get the Aggregated Reconciliation Output for each table
         for table_conf in table_recon.tables:
-            normalized_table_conf = NormalizeReconConfigService(
-                reconciler.source, reconciler.target
-            ).normalize_recon_table_config(table_conf)
-
             recon_process_duration = ReconcileProcessDuration(start_ts=str(datetime.now()), end_ts=None)
             try:
                 src_schema, tgt_schema = TriggerReconService.get_schemas(
-                    reconciler.source, reconciler.target, normalized_table_conf, reconcile_config.database_config
+                    reconciler.source, reconciler.target, table_conf, reconcile_config.database_config, False
                 )
             except DataSourceRuntimeException as e:
                 raise ReconciliationException(message=str(e)) from e

-            assert normalized_table_conf.aggregates, "Aggregates must be defined for Aggregates Reconciliation"
+            assert table_conf.aggregates, "Aggregates must be defined for Aggregates Reconciliation"

-            table_reconcile_agg_output_list: list[AggregateQueryOutput] = (
-                TriggerReconAggregateService._run_reconcile_aggregates(
-                    reconciler=reconciler,
-                    table_conf=normalized_table_conf,
-                    src_schema=src_schema,
-                    tgt_schema=tgt_schema,
-                )
-            )
+            try:
+                table_reconcile_agg_output_list = reconciler.reconcile_aggregates(table_conf, src_schema, tgt_schema)
+            except DataSourceRuntimeException as e:
+                table_reconcile_agg_output_list = [
+                    AggregateQueryOutput(reconcile_output=DataReconcileOutput(exception=str(e)), rule=None)
+                ]

             recon_process_duration.end_ts = str(datetime.now())

             # Persist the data to the delta tables
             recon_capture.store_aggregates_metrics(
                 reconcile_agg_output_list=table_reconcile_agg_output_list,
-                table_conf=normalized_table_conf,
+                table_conf=table_conf,
                 recon_process_duration=recon_process_duration,
             )

             (
                 ReconIntermediatePersist(
                     spark=spark,
-                    path=utils.generate_volume_path(normalized_table_conf, reconcile_config.metadata_config),
+                    path=utils.generate_volume_path(table_conf, reconcile_config.metadata_config),
                 ).clean_unmatched_df_from_volume()
             )

@@ -84,15 +76,3 @@ class TriggerReconAggregateService:
             ),
             operation_name=AGG_RECONCILE_OPERATION_NAME,
         )
-
-    @staticmethod
-    def _run_reconcile_aggregates(
-        reconciler: Reconciliation,
-        table_conf: Table,
-        src_schema: list[Schema],
-        tgt_schema: list[Schema],
-    ) -> list[AggregateQueryOutput]:
-        try:
-            return reconciler.reconcile_aggregates(table_conf, src_schema, tgt_schema)
-        except DataSourceRuntimeException as e:
-            return [AggregateQueryOutput(reconcile_output=DataReconcileOutput(exception=str(e)), rule=None)]
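The refactor above drops the `_run_reconcile_aggregates` wrapper but keeps its behavior: a `DataSourceRuntimeException` during aggregate reconciliation is recorded as an output entry rather than aborting the loop over tables. A minimal, self-contained sketch of that pattern (toy stand-ins, not the lakebridge classes):

    from dataclasses import dataclass

    class DataSourceRuntimeException(Exception):
        pass

    @dataclass
    class AggregateQueryOutput:
        exception: str | None = None
        rows_matched: int | None = None

    def reconcile_aggregates(table: str) -> list[AggregateQueryOutput]:
        if table == "broken_table":  # hypothetical failure, for illustration
            raise DataSourceRuntimeException(f"cannot read {table}")
        return [AggregateQueryOutput(rows_matched=42)]

    for table in ("orders", "broken_table"):
        try:
            outputs = reconcile_aggregates(table)
        except DataSourceRuntimeException as e:
            # Mirrors the inlined except-branch above: record, keep looping.
            outputs = [AggregateQueryOutput(exception=str(e))]
        print(table, outputs)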
databricks/labs/lakebridge/reconcile/trigger_recon_service.py
@@ -138,7 +138,7 @@ class TriggerReconService:

         try:
             src_schema, tgt_schema = TriggerReconService.get_schemas(
-                reconciler.source, reconciler.target, table_conf, reconcile_config.database_config
+                reconciler.source, reconciler.target, table_conf, reconcile_config.database_config, True
             )
         except DataSourceRuntimeException as e:
             schema_reconcile_output = SchemaReconcileOutput(is_valid=False, exception=str(e))
@@ -170,17 +170,20 @@ class TriggerReconService:
         target: DataSource,
         table_conf: Table,
         database_config: DatabaseConfig,
+        normalize: bool,
     ) -> tuple[list[Schema], list[Schema]]:
         src_schema = source.get_schema(
             catalog=database_config.source_catalog,
             schema=database_config.source_schema,
             table=table_conf.source_name,
+            normalize=normalize,
         )

         tgt_schema = target.get_schema(
             catalog=database_config.target_catalog,
             schema=database_config.target_schema,
             table=table_conf.target_name,
+            normalize=normalize,
         )

         return src_schema, tgt_schema
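With this change `get_schemas` takes an explicit `normalize` flag and threads it to both `source.get_schema` and `target.get_schema`: the row/schema reconciliation path passes `True`, while the aggregates path passes `False` (normalization was previously applied up front via `NormalizeReconConfigService`). A minimal sketch of the threading, where the toy `get_schema` and its lowercasing normalization are assumptions for illustration only:

    from dataclasses import dataclass

    @dataclass
    class Schema:
        column_name: str
        data_type: str

    def get_schema(table: str, *, normalize: bool) -> list[Schema]:
        # Stand-in for DataSource.get_schema; lowercasing is an assumed
        # normalization, purely for illustration.
        raw = [Schema("ID", "INT"), Schema("CustomerName", "STRING")]
        if not normalize:
            return raw
        return [Schema(s.column_name.lower(), s.data_type.lower()) for s in raw]

    def get_schemas(source_table: str, target_table: str, normalize: bool):
        # Mirrors TriggerReconService.get_schemas: one flag, applied to both sides.
        return get_schema(source_table, normalize=normalize), get_schema(target_table, normalize=normalize)

    # Row/schema reconciliation passes True; the aggregates flow passes False.
    print(get_schemas("src.orders", "tgt.orders", normalize=True))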
databricks/labs/lakebridge/transpiler/execute.py
@@ -48,6 +48,26 @@ class TranspilingContext:
     transpiled_code: str | None = None


+def _validate_transpiled_sql(context: TranspilingContext, content: str, error_list: list[TranspileError]) -> str:
+    if context.validator is None:
+        return content
+    validation_result = _validation(context.validator, context.config, str(content))
+    # Potentially expensive, only evaluate if debug is enabled
+    if logger.isEnabledFor(logging.DEBUG):
+        msg = f"Finished validating transpiled code for file: {context.input_path} (result: {validation_result})"
+        logger.debug(msg)
+    if validation_result.exception_msg is not None:
+        error = TranspileError(
+            "VALIDATION_ERROR",
+            ErrorKind.VALIDATION,
+            ErrorSeverity.WARNING,
+            context.input_path,
+            validation_result.exception_msg,
+        )
+        error_list.append(error)
+    return validation_result.validated_sql
+
+
 async def _process_one_file(context: TranspilingContext) -> tuple[int, list[TranspileError]]:
     input_path = context.input_path

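The new `_validate_transpiled_sql` helper centralizes what `_process_single_result` used to inline: when no validator is configured the content passes through untouched, and validation failures become WARNING-severity `TranspileError`s instead of exceptions. A toy sketch of that record-don't-raise shape (stand-in types; the unterminated-statement check is invented):

    from dataclasses import dataclass

    @dataclass
    class ValidationResult:
        validated_sql: str
        exception_msg: str | None = None

    def validate_sql(content: str, error_list: list[str]) -> str:
        # Stand-in validator, not the lakebridge _validation() function.
        if content.rstrip().endswith(";"):
            result = ValidationResult(content)
        else:
            result = ValidationResult(content, "statement is not terminated")
        if result.exception_msg is not None:
            # Record a warning instead of raising, as the helper above does.
            error_list.append(f"VALIDATION_ERROR: {result.exception_msg}")
        return result.validated_sql

    errors: list[str] = []
    sql = validate_sql("SELECT 1", errors)
    print(sql, errors)  # SELECT 1 ['VALIDATION_ERROR: statement is not terminated']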
@@ -89,29 +109,29 @@ async def _process_one_file(context: TranspilingContext) -> tuple[int, list[TranspileError]]:
     assert output_path is not None, "Output path must be set in the context"
     output_path.parent.mkdir(exist_ok=True)

-    if _is_combined_result(transpile_result):
-        _process_combined_result(context, error_list)
+    if _is_mime_result(transpile_result):
+        _process_mime_result(context, error_list)
     else:
-        _process_single_result(context, error_list)
+        _process_non_mime_result(context, error_list)

     return transpile_result.success_count, error_list


-def _is_combined_result(result: TranspileResult):
+def _is_mime_result(result: TranspileResult):
     return result.transpiled_code.startswith("Content-Type: multipart/mixed; boundary=")


-def _process_combined_result(context: TranspilingContext, _error_list: list[TranspileError]) -> None:
+def _process_mime_result(context: TranspilingContext, error_list: list[TranspileError]) -> None:
     # TODO error handling
     # Added policy to process quoted-printable encoded
     parser = EmailParser(policy=policy.default)
     transpiled_code: str = cast(str, context.transpiled_code)
     message: Message = parser.parsestr(transpiled_code)
     for part in message.walk():
-        _process_combined_part(context, part)
+        _process_combined_part(context, part, error_list)


-def _process_combined_part(context: TranspilingContext, part: Message) -> None:
+def _process_combined_part(context: TranspilingContext, part: Message, error_list: list[TranspileError]) -> None:
     if part.get_content_type() != "text/plain":
         return  # TODO Need to handle other content types, e.g., text/binary, application/json, etc.
     filename = part.get_filename()
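`_is_mime_result` keys off the `multipart/mixed` preamble, and `_process_mime_result` hands the payload to the stdlib email parser so each `text/plain` part can be written out as its own file. A runnable sketch of that stdlib machinery with a fabricated one-part payload (the filename and SQL are made up, and `EmailParser` is assumed to alias `email.parser.Parser`):

    from email import policy
    from email.parser import Parser

    payload = (
        'Content-Type: multipart/mixed; boundary="sep"\n'
        "\n"
        "--sep\n"
        "Content-Type: text/plain\n"
        'Content-Disposition: attachment; filename="out/query1.sql"\n'
        "\n"
        "SELECT 1;\n"
        "--sep--\n"
    )

    message = Parser(policy=policy.default).parsestr(payload)
    for part in message.walk():
        # walk() yields the multipart container first; skip non-text parts,
        # as _process_combined_part does.
        if part.get_content_type() != "text/plain":
            continue
        print(part.get_filename(), "->", part.get_content().strip())
    # out/query1.sql -> SELECT 1;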
@@ -133,35 +153,21 @@ def _process_combined_part(context: TranspilingContext, part: Message) -> None:
     folder.mkdir(parents=True, exist_ok=True)
     output = folder / segments[-1]
     logger.debug(f"Writing output to: {output}")
+    # Only validate if output file has .sql suffix
+    if output.suffix == ".sql":
+        content = _validate_transpiled_sql(context, content, error_list)
     output.write_text(content)


-def _process_single_result(context: TranspilingContext, error_list: list[TranspileError]) -> None:
+def _process_non_mime_result(context: TranspilingContext, error_list: list[TranspileError]) -> None:

     output_code: str = context.transpiled_code or ""
+    output_path = cast(Path, context.output_path)

     if any(err.kind == ErrorKind.PARSING for err in error_list):
         output_code = context.source_code or ""
-
-    elif context.validator:
-        logger.debug(f"Validating transpiled code for file: {context.input_path}")
-        validation_result = _validation(context.validator, context.config, str(context.transpiled_code))
-        # Potentially expensive, only evaluate if debug is enabled
-        if logger.isEnabledFor(logging.DEBUG):
-            msg = f"Finished validating transpiled code for file: {context.input_path} (result: {validation_result})"
-            logger.debug(msg)
-        if validation_result.exception_msg is not None:
-            error = TranspileError(
-                "VALIDATION_ERROR",
-                ErrorKind.VALIDATION,
-                ErrorSeverity.WARNING,
-                context.input_path,
-                validation_result.exception_msg,
-            )
-            error_list.append(error)
-        output_code = validation_result.validated_sql
-
-    output_path = cast(Path, context.output_path)
+    elif output_path.suffix == ".sql":
+        output_code = _validate_transpiled_sql(context, output_code, error_list)
     with output_path.open("w") as w:
         # The above adds a java-style comment block at the top of the output file
         # This would break .py or .json outputs so we disable it for now.
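Both write paths now gate validation on the output suffix, so non-SQL artifacts (for example `.py` or `.json`) are written verbatim. The gate itself is plain `pathlib`, as this short sketch shows:

    from pathlib import Path

    for name in ("query.sql", "job.py", "conf.json"):
        output = Path(name)
        # Only .sql outputs go through SQL validation before writing.
        action = "validate, then write" if output.suffix == ".sql" else "write as-is"
        print(output, "->", action)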