databricks-labs-lakebridge 0.10.6__py3-none-any.whl → 0.10.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. databricks/labs/lakebridge/__about__.py +1 -1
  2. databricks/labs/lakebridge/analyzer/__init__.py +0 -0
  3. databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py +95 -0
  4. databricks/labs/lakebridge/base_install.py +24 -3
  5. databricks/labs/lakebridge/cli.py +19 -53
  6. databricks/labs/lakebridge/contexts/application.py +7 -0
  7. databricks/labs/lakebridge/deployment/job.py +2 -2
  8. databricks/labs/lakebridge/helpers/file_utils.py +36 -0
  9. databricks/labs/lakebridge/install.py +187 -157
  10. databricks/labs/lakebridge/reconcile/compare.py +70 -33
  11. databricks/labs/lakebridge/reconcile/connectors/data_source.py +19 -0
  12. databricks/labs/lakebridge/reconcile/connectors/databricks.py +11 -1
  13. databricks/labs/lakebridge/reconcile/connectors/dialect_utils.py +126 -0
  14. databricks/labs/lakebridge/reconcile/connectors/models.py +7 -0
  15. databricks/labs/lakebridge/reconcile/connectors/oracle.py +11 -1
  16. databricks/labs/lakebridge/reconcile/connectors/snowflake.py +14 -2
  17. databricks/labs/lakebridge/reconcile/connectors/tsql.py +27 -2
  18. databricks/labs/lakebridge/reconcile/constants.py +4 -3
  19. databricks/labs/lakebridge/reconcile/execute.py +9 -810
  20. databricks/labs/lakebridge/reconcile/normalize_recon_config_service.py +133 -0
  21. databricks/labs/lakebridge/reconcile/query_builder/base.py +3 -7
  22. databricks/labs/lakebridge/reconcile/recon_config.py +3 -0
  23. databricks/labs/lakebridge/reconcile/recon_output_config.py +2 -1
  24. databricks/labs/lakebridge/reconcile/reconciliation.py +508 -0
  25. databricks/labs/lakebridge/reconcile/schema_compare.py +26 -19
  26. databricks/labs/lakebridge/reconcile/trigger_recon_aggregate_service.py +98 -0
  27. databricks/labs/lakebridge/reconcile/trigger_recon_service.py +253 -0
  28. databricks/labs/lakebridge/reconcile/utils.py +38 -0
  29. databricks/labs/lakebridge/transpiler/lsp/lsp_engine.py +45 -60
  30. databricks/labs/lakebridge/transpiler/sqlglot/dialect_utils.py +2 -0
  31. databricks/labs/lakebridge/transpiler/transpile_engine.py +0 -18
  32. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/METADATA +1 -1
  33. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/RECORD +37 -28
  34. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/WHEEL +0 -0
  35. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/entry_points.txt +0 -0
  36. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/licenses/LICENSE +0 -0
  37. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/licenses/NOTICE +0 -0
databricks/labs/lakebridge/reconcile/schema_compare.py

@@ -1,10 +1,10 @@
  import logging
- from dataclasses import asdict

  from pyspark.sql import DataFrame, SparkSession
  from pyspark.sql.types import BooleanType, StringType, StructField, StructType
  from sqlglot import Dialect, parse_one

+ from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils
  from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import get_dialect
  from databricks.labs.lakebridge.reconcile.recon_config import Schema, Table
  from databricks.labs.lakebridge.reconcile.recon_output_config import SchemaMatchResult, SchemaReconcileOutput
@@ -20,8 +20,7 @@ class SchemaCompare:
  ):
  self.spark = spark

- # Define the schema for the schema compare DataFrame
- _schema_compare_schema: StructType = StructType(
+ _schema_compare_output_schema: StructType = StructType(
  [
  StructField("source_column", StringType(), False),
  StructField("source_datatype", StringType(), False),
@@ -47,14 +46,16 @@ class SchemaCompare:
  target_column_map = table_conf.to_src_col_map or {}
  master_schema_match_res = [
  SchemaMatchResult(
- source_column=s.column_name,
- databricks_column=target_column_map.get(s.column_name, s.column_name),
+ source_column_normalized=s.source_normalized_column_name,
+ source_column_normalized_ansi=s.ansi_normalized_column_name,
  source_datatype=s.data_type,
+ databricks_column=target_column_map.get(s.ansi_normalized_column_name, s.ansi_normalized_column_name),
  databricks_datatype=next(
  (
  tgt.data_type
  for tgt in databricks_schema
- if tgt.column_name == target_column_map.get(s.column_name, s.column_name)
+ if tgt.ansi_normalized_column_name
+ == target_column_map.get(s.ansi_normalized_column_name, s.ansi_normalized_column_name)
  ),
  "",
  ),
@@ -63,16 +64,22 @@ class SchemaCompare:
  ]
  return master_schema_match_res

- def _create_dataframe(self, data: list, schema: StructType) -> DataFrame:
- """
- :param data: Expectation is list of dataclass
- :param schema: Target schema
- :return: DataFrame
- """
- data = [tuple(asdict(item).values()) for item in data]
- df = self.spark.createDataFrame(data, schema)
+ def _create_output_dataframe(self, data: list[SchemaMatchResult], schema: StructType) -> DataFrame:
+ """Return a user-friendly dataframe for schema compare result."""
+ transformed = []
+ for item in data:
+ output = tuple(
+ [
+ DialectUtils.unnormalize_identifier(item.source_column_normalized_ansi),
+ item.source_datatype,
+ DialectUtils.unnormalize_identifier(item.databricks_column),
+ item.databricks_datatype,
+ item.is_valid,
+ ]
+ )
+ transformed.append(output)

- return df
+ return self.spark.createDataFrame(transformed, schema)

  @classmethod
  def _parse(cls, source: Dialect, column: str, data_type: str) -> str:
@@ -88,10 +95,10 @@ class SchemaCompare:

  @classmethod
  def _validate_parsed_query(cls, master: SchemaMatchResult, parsed_query) -> None:
- databricks_query = f"create table dummy ({master.source_column} {master.databricks_datatype})"
+ databricks_query = f"create table dummy ({master.source_column_normalized_ansi} {master.databricks_datatype})"
  logger.info(
  f"""
- Source datatype: create table dummy ({master.source_column} {master.source_datatype})
+ Source datatype: create table dummy ({master.source_column_normalized} {master.source_datatype})
  Parse datatype: {parsed_query}
  Databricks datatype: {databricks_query}
  """
@@ -116,11 +123,11 @@ class SchemaCompare:
  master_schema = self._build_master_schema(source_schema, databricks_schema, table_conf)
  for master in master_schema:
  if not isinstance(source, Databricks):
- parsed_query = self._parse(source, master.source_column, master.source_datatype)
+ parsed_query = self._parse(source, master.source_column_normalized, master.source_datatype)
  self._validate_parsed_query(master, parsed_query)
  elif master.source_datatype.lower() != master.databricks_datatype.lower():
  master.is_valid = False

- df = self._create_dataframe(master_schema, self._schema_compare_schema)
+ df = self._create_output_dataframe(master_schema, self._schema_compare_output_schema)
  final_result = self._table_schema_status(master_schema)
  return SchemaReconcileOutput(final_result, df)
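The SchemaCompare changes above replace the generic asdict()-based row builder with an explicit, field-by-field mapping, and strip identifier quoting before the report is produced. Below is a minimal sketch of that pattern; the stand-in unnormalize_identifier is hypothetical (the real DialectUtils implementation is in the new connectors/dialect_utils.py, which is not shown in this excerpt).

```python
from dataclasses import dataclass


@dataclass
class SchemaMatchRow:  # stand-in for SchemaMatchResult
    source_column_normalized_ansi: str
    source_datatype: str
    databricks_column: str
    databricks_datatype: str
    is_valid: bool = True


def unnormalize_identifier(identifier: str) -> str:
    # Assumed behaviour for illustration only: present identifiers without ANSI quoting.
    return identifier.strip('`"')


def to_output_rows(results: list[SchemaMatchRow]) -> list[tuple]:
    # Mirrors the explicit field-by-field tuple construction that replaced asdict().
    return [
        (
            unnormalize_identifier(r.source_column_normalized_ansi),
            r.source_datatype,
            unnormalize_identifier(r.databricks_column),
            r.databricks_datatype,
            r.is_valid,
        )
        for r in results
    ]


rows = to_output_rows([SchemaMatchRow('"order_id"', "number", "`order_id`", "decimal(38,0)")])
# spark.createDataFrame(rows, schema) then yields the user-facing report.
```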
databricks/labs/lakebridge/reconcile/trigger_recon_aggregate_service.py (new file)

@@ -0,0 +1,98 @@
+ from datetime import datetime
+
+ from pyspark.sql import SparkSession
+ from databricks.sdk import WorkspaceClient
+
+ from databricks.labs.lakebridge.config import ReconcileConfig, TableRecon
+ from databricks.labs.lakebridge.reconcile import utils
+ from databricks.labs.lakebridge.reconcile.exception import DataSourceRuntimeException, ReconciliationException
+ from databricks.labs.lakebridge.reconcile.recon_capture import (
+ ReconIntermediatePersist,
+ generate_final_reconcile_aggregate_output,
+ )
+ from databricks.labs.lakebridge.reconcile.recon_config import Table, Schema, AGG_RECONCILE_OPERATION_NAME
+ from databricks.labs.lakebridge.reconcile.recon_output_config import (
+ ReconcileProcessDuration,
+ AggregateQueryOutput,
+ DataReconcileOutput,
+ )
+ from databricks.labs.lakebridge.reconcile.reconciliation import Reconciliation
+ from databricks.labs.lakebridge.reconcile.trigger_recon_service import TriggerReconService
+ from databricks.labs.lakebridge.reconcile.normalize_recon_config_service import NormalizeReconConfigService
+
+
+ class TriggerReconAggregateService:
+ @staticmethod
+ def trigger_recon_aggregates(
+ ws: WorkspaceClient,
+ spark: SparkSession,
+ table_recon: TableRecon,
+ reconcile_config: ReconcileConfig,
+ local_test_run: bool = False,
+ ):
+ reconciler, recon_capture = TriggerReconService.create_recon_dependencies(
+ ws, spark, reconcile_config, local_test_run
+ )
+
+ # Get the Aggregated Reconciliation Output for each table
+ for table_conf in table_recon.tables:
+ normalized_table_conf = NormalizeReconConfigService(
+ reconciler.source, reconciler.target
+ ).normalize_recon_table_config(table_conf)
+
+ recon_process_duration = ReconcileProcessDuration(start_ts=str(datetime.now()), end_ts=None)
+ try:
+ src_schema, tgt_schema = TriggerReconService.get_schemas(
+ reconciler.source, reconciler.target, normalized_table_conf, reconcile_config.database_config
+ )
+ except DataSourceRuntimeException as e:
+ raise ReconciliationException(message=str(e)) from e
+
+ assert normalized_table_conf.aggregates, "Aggregates must be defined for Aggregates Reconciliation"
+
+ table_reconcile_agg_output_list: list[AggregateQueryOutput] = (
+ TriggerReconAggregateService._run_reconcile_aggregates(
+ reconciler=reconciler,
+ table_conf=normalized_table_conf,
+ src_schema=src_schema,
+ tgt_schema=tgt_schema,
+ )
+ )
+
+ recon_process_duration.end_ts = str(datetime.now())
+
+ # Persist the data to the delta tables
+ recon_capture.store_aggregates_metrics(
+ reconcile_agg_output_list=table_reconcile_agg_output_list,
+ table_conf=normalized_table_conf,
+ recon_process_duration=recon_process_duration,
+ )
+
+ (
+ ReconIntermediatePersist(
+ spark=spark,
+ path=utils.generate_volume_path(normalized_table_conf, reconcile_config.metadata_config),
+ ).clean_unmatched_df_from_volume()
+ )
+
+ return TriggerReconService.verify_successful_reconciliation(
+ generate_final_reconcile_aggregate_output(
+ recon_id=recon_capture.recon_id,
+ spark=spark,
+ metadata_config=reconcile_config.metadata_config,
+ local_test_run=local_test_run,
+ ),
+ operation_name=AGG_RECONCILE_OPERATION_NAME,
+ )
+
+ @staticmethod
+ def _run_reconcile_aggregates(
+ reconciler: Reconciliation,
+ table_conf: Table,
+ src_schema: list[Schema],
+ tgt_schema: list[Schema],
+ ) -> list[AggregateQueryOutput]:
+ try:
+ return reconciler.reconcile_aggregates(table_conf, src_schema, tgt_schema)
+ except DataSourceRuntimeException as e:
+ return [AggregateQueryOutput(reconcile_output=DataReconcileOutput(exception=str(e)), rule=None)]
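The new trigger_recon_aggregate_service.py (per the file list above) puts aggregate reconciliation behind a single static entry point. A minimal usage sketch, assuming a WorkspaceClient, SparkSession, TableRecon, and ReconcileConfig are already constructed by the surrounding job:

```python
from databricks.labs.lakebridge.reconcile.exception import ReconciliationException
from databricks.labs.lakebridge.reconcile.trigger_recon_aggregate_service import TriggerReconAggregateService


def run_aggregate_recon(ws, spark, table_recon, reconcile_config):
    try:
        # Normalizes each Table config, runs the aggregate rules per table, stores the
        # metrics, and verifies the final aggregate output before returning it.
        return TriggerReconAggregateService.trigger_recon_aggregates(ws, spark, table_recon, reconcile_config)
    except ReconciliationException as exc:
        # Raised when the schema lookup fails or any table's aggregate reconciliation fails.
        print(f"Aggregate reconciliation failed: {exc}")
        raise
```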
databricks/labs/lakebridge/reconcile/trigger_recon_service.py (new file)

@@ -0,0 +1,253 @@
+ import logging
+ from datetime import datetime
+ from uuid import uuid4
+
+ from pyspark.errors import PySparkException
+ from pyspark.sql import SparkSession
+
+ from databricks.sdk import WorkspaceClient
+
+ from databricks.labs.lakebridge.config import ReconcileConfig, TableRecon, DatabaseConfig
+ from databricks.labs.lakebridge.reconcile import utils
+ from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource
+ from databricks.labs.lakebridge.reconcile.exception import DataSourceRuntimeException, ReconciliationException
+ from databricks.labs.lakebridge.reconcile.recon_capture import (
+ ReconCapture,
+ ReconIntermediatePersist,
+ generate_final_reconcile_output,
+ )
+ from databricks.labs.lakebridge.reconcile.recon_config import Table, Schema
+ from databricks.labs.lakebridge.reconcile.recon_output_config import (
+ ReconcileOutput,
+ ReconcileProcessDuration,
+ SchemaReconcileOutput,
+ DataReconcileOutput,
+ )
+ from databricks.labs.lakebridge.reconcile.reconciliation import Reconciliation
+ from databricks.labs.lakebridge.reconcile.schema_compare import SchemaCompare
+ from databricks.labs.lakebridge.reconcile.normalize_recon_config_service import NormalizeReconConfigService
+ from databricks.labs.lakebridge.transpiler.execute import verify_workspace_client
+ from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import get_dialect
+
+ logger = logging.getLogger(__name__)
+ _RECON_REPORT_TYPES = {"schema", "data", "row", "all", "aggregate"}
+
+
+ class TriggerReconService:
+
+ @staticmethod
+ def trigger_recon(
+ ws: WorkspaceClient,
+ spark: SparkSession,
+ table_recon: TableRecon,
+ reconcile_config: ReconcileConfig,
+ local_test_run: bool = False,
+ ) -> ReconcileOutput:
+ reconciler, recon_capture = TriggerReconService.create_recon_dependencies(
+ ws, spark, reconcile_config, local_test_run
+ )
+
+ for table_conf in table_recon.tables:
+ TriggerReconService.recon_one(spark, reconciler, recon_capture, reconcile_config, table_conf)
+
+ return TriggerReconService.verify_successful_reconciliation(
+ generate_final_reconcile_output(
+ recon_id=recon_capture.recon_id,
+ spark=spark,
+ metadata_config=reconcile_config.metadata_config,
+ local_test_run=local_test_run,
+ )
+ )
+
+ @staticmethod
+ def create_recon_dependencies(
+ ws: WorkspaceClient, spark: SparkSession, reconcile_config: ReconcileConfig, local_test_run: bool = False
+ ) -> tuple[Reconciliation, ReconCapture]:
+ ws_client: WorkspaceClient = verify_workspace_client(ws)
+
+ # validate the report type
+ report_type = reconcile_config.report_type.lower()
+ logger.info(f"report_type: {report_type}, data_source: {reconcile_config.data_source} ")
+ utils.validate_input(report_type, _RECON_REPORT_TYPES, "Invalid report type")
+
+ source, target = utils.initialise_data_source(
+ engine=reconcile_config.data_source,
+ spark=spark,
+ ws=ws_client,
+ secret_scope=reconcile_config.secret_scope,
+ )
+
+ recon_id = str(uuid4())
+ # initialise the Reconciliation
+ reconciler = Reconciliation(
+ source,
+ target,
+ reconcile_config.database_config,
+ report_type,
+ SchemaCompare(spark=spark),
+ get_dialect(reconcile_config.data_source),
+ spark,
+ metadata_config=reconcile_config.metadata_config,
+ )
+
+ recon_capture = ReconCapture(
+ database_config=reconcile_config.database_config,
+ recon_id=recon_id,
+ report_type=report_type,
+ source_dialect=get_dialect(reconcile_config.data_source),
+ ws=ws_client,
+ spark=spark,
+ metadata_config=reconcile_config.metadata_config,
+ local_test_run=local_test_run,
+ )
+
+ return reconciler, recon_capture
+
+ @staticmethod
+ def recon_one(
+ spark: SparkSession,
+ reconciler: Reconciliation,
+ recon_capture: ReconCapture,
+ reconcile_config: ReconcileConfig,
+ table_conf: Table,
+ ):
+ normalized_table_conf = NormalizeReconConfigService(
+ reconciler.source, reconciler.target
+ ).normalize_recon_table_config(table_conf)
+
+ schema_reconcile_output, data_reconcile_output, recon_process_duration = TriggerReconService._do_recon_one(
+ reconciler, reconcile_config, normalized_table_conf
+ )
+
+ TriggerReconService.persist_delta_table(
+ spark,
+ reconciler,
+ recon_capture,
+ schema_reconcile_output,
+ data_reconcile_output,
+ reconcile_config,
+ normalized_table_conf,
+ recon_process_duration,
+ )
+
+ @staticmethod
+ def _do_recon_one(reconciler: Reconciliation, reconcile_config: ReconcileConfig, table_conf: Table):
+ recon_process_duration = ReconcileProcessDuration(start_ts=str(datetime.now()), end_ts=None)
+ schema_reconcile_output = SchemaReconcileOutput(is_valid=True)
+ data_reconcile_output = DataReconcileOutput()
+
+ try:
+ src_schema, tgt_schema = TriggerReconService.get_schemas(
+ reconciler.source, reconciler.target, table_conf, reconcile_config.database_config
+ )
+ except DataSourceRuntimeException as e:
+ schema_reconcile_output = SchemaReconcileOutput(is_valid=False, exception=str(e))
+ else:
+ if reconciler.report_type in {"schema", "all"}:
+ schema_reconcile_output = TriggerReconService._run_reconcile_schema(
+ reconciler=reconciler,
+ table_conf=table_conf,
+ src_schema=src_schema,
+ tgt_schema=tgt_schema,
+ )
+ logger.warning("Schema comparison is completed.")
+
+ if reconciler.report_type in {"data", "row", "all"}:
+ data_reconcile_output = TriggerReconService._run_reconcile_data(
+ reconciler=reconciler,
+ table_conf=table_conf,
+ src_schema=src_schema,
+ tgt_schema=tgt_schema,
+ )
+ logger.warning(f"Reconciliation for '{reconciler.report_type}' report completed.")
+
+ recon_process_duration.end_ts = str(datetime.now())
+ return schema_reconcile_output, data_reconcile_output, recon_process_duration
+
+ @staticmethod
+ def get_schemas(
+ source: DataSource,
+ target: DataSource,
+ table_conf: Table,
+ database_config: DatabaseConfig,
+ ) -> tuple[list[Schema], list[Schema]]:
+ src_schema = source.get_schema(
+ catalog=database_config.source_catalog,
+ schema=database_config.source_schema,
+ table=table_conf.source_name,
+ )
+
+ tgt_schema = target.get_schema(
+ catalog=database_config.target_catalog,
+ schema=database_config.target_schema,
+ table=table_conf.target_name,
+ )
+
+ return src_schema, tgt_schema
+
+ @staticmethod
+ def _run_reconcile_schema(
+ reconciler: Reconciliation,
+ table_conf: Table,
+ src_schema: list[Schema],
+ tgt_schema: list[Schema],
+ ):
+ try:
+ return reconciler.reconcile_schema(table_conf=table_conf, src_schema=src_schema, tgt_schema=tgt_schema)
+ except PySparkException as e:
+ return SchemaReconcileOutput(is_valid=False, exception=str(e))
+
+ @staticmethod
+ def _run_reconcile_data(
+ reconciler: Reconciliation,
+ table_conf: Table,
+ src_schema: list[Schema],
+ tgt_schema: list[Schema],
+ ) -> DataReconcileOutput:
+ try:
+ return reconciler.reconcile_data(table_conf=table_conf, src_schema=src_schema, tgt_schema=tgt_schema)
+ except DataSourceRuntimeException as e:
+ return DataReconcileOutput(exception=str(e))
+
+ @staticmethod
+ def persist_delta_table(
+ spark: SparkSession,
+ reconciler: Reconciliation,
+ recon_capture: ReconCapture,
+ schema_reconcile_output: SchemaReconcileOutput,
+ data_reconcile_output: DataReconcileOutput,
+ reconcile_config: ReconcileConfig,
+ table_conf: Table,
+ recon_process_duration: ReconcileProcessDuration,
+ ):
+ recon_capture.start(
+ data_reconcile_output=data_reconcile_output,
+ schema_reconcile_output=schema_reconcile_output,
+ table_conf=table_conf,
+ recon_process_duration=recon_process_duration,
+ record_count=reconciler.get_record_count(table_conf, reconciler.report_type),
+ )
+ if reconciler.report_type != "schema":
+ ReconIntermediatePersist(
+ spark=spark, path=utils.generate_volume_path(table_conf, reconcile_config.metadata_config)
+ ).clean_unmatched_df_from_volume()
+
+ @staticmethod
+ def verify_successful_reconciliation(
+ reconcile_output: ReconcileOutput, operation_name: str = "reconcile"
+ ) -> ReconcileOutput:
+ for table_output in reconcile_output.results:
+ if table_output.exception_message or (
+ table_output.status.column is False
+ or table_output.status.row is False
+ or table_output.status.schema is False
+ or table_output.status.aggregate is False
+ ):
+ raise ReconciliationException(
+ f" Reconciliation failed for one or more tables. Please check the recon metrics for more details."
+ f" **{operation_name}** failed.",
+ reconcile_output=reconcile_output,
+ )
+
+ logger.info("Reconciliation completed successfully.")
+ return reconcile_output
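trigger_recon_service.py now carries the per-table orchestration that previously sat in reconcile/execute.py (which drops roughly 800 lines in this release, per the file list). A short sketch of driving it directly, assuming the same pre-built inputs as in the aggregate example above:

```python
from databricks.labs.lakebridge.reconcile.exception import ReconciliationException
from databricks.labs.lakebridge.reconcile.trigger_recon_service import TriggerReconService


def run_recon(ws, spark, table_recon, reconcile_config):
    try:
        # reconcile_config.report_type is validated against {"schema", "data", "row", "all", "aggregate"}.
        return TriggerReconService.trigger_recon(ws, spark, table_recon, reconcile_config)
    except ReconciliationException as exc:
        # Raised by verify_successful_reconciliation when any table reports a failed
        # column/row/schema/aggregate status or carries an exception message.
        print(f"Reconciliation failed: {exc}")
        raise
```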
databricks/labs/lakebridge/reconcile/utils.py (new file)

@@ -0,0 +1,38 @@
+ import logging
+
+ from pyspark.sql import SparkSession
+
+ from databricks.sdk import WorkspaceClient
+
+ from databricks.labs.lakebridge.config import ReconcileMetadataConfig
+ from databricks.labs.lakebridge.reconcile.connectors.source_adapter import create_adapter
+ from databricks.labs.lakebridge.reconcile.exception import InvalidInputException
+ from databricks.labs.lakebridge.reconcile.recon_config import Table
+ from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import get_dialect
+
+ logger = logging.getLogger(__name__)
+
+
+ def initialise_data_source(
+ ws: WorkspaceClient,
+ spark: SparkSession,
+ engine: str,
+ secret_scope: str,
+ ):
+ source = create_adapter(engine=get_dialect(engine), spark=spark, ws=ws, secret_scope=secret_scope)
+ target = create_adapter(engine=get_dialect("databricks"), spark=spark, ws=ws, secret_scope=secret_scope)
+
+ return source, target
+
+
+ def validate_input(input_value: str, list_of_value: set, message: str):
+ if input_value not in list_of_value:
+ error_message = f"{message} --> {input_value} is not one of {list_of_value}"
+ logger.error(error_message)
+ raise InvalidInputException(error_message)
+
+
+ def generate_volume_path(table_conf: Table, metadata_config: ReconcileMetadataConfig):
+ catalog = metadata_config.catalog
+ schema = metadata_config.schema
+ return f"/Volumes/{catalog}/{schema}/{metadata_config.volume}/{table_conf.source_name}_{table_conf.target_name}/"
databricks/labs/lakebridge/transpiler/lsp/lsp_engine.py

@@ -4,7 +4,9 @@ import abc
  import asyncio
  import logging
  import os
+ import shutil
  import sys
+ import venv
  from collections.abc import Callable, Sequence, Mapping
  from dataclasses import dataclass
  from pathlib import Path
@@ -35,7 +37,7 @@ from pygls.lsp.client import BaseLanguageClient
  from databricks.labs.blueprint.wheels import ProductInfo
  from databricks.labs.lakebridge.config import LSPConfigOptionV1, TranspileConfig, TranspileResult
  from databricks.labs.lakebridge.errors.exceptions import IllegalStateException
- from databricks.labs.lakebridge.helpers.file_utils import chdir, is_dbt_project_file, is_sql_file
+ from databricks.labs.lakebridge.helpers.file_utils import is_dbt_project_file, is_sql_file
  from databricks.labs.lakebridge.transpiler.transpile_engine import TranspileEngine
  from databricks.labs.lakebridge.transpiler.transpile_status import (
  CodePosition,
@@ -409,9 +411,7 @@ class LSPEngine(TranspileEngine):
  if self.is_alive:
  raise IllegalStateException("LSP engine is already initialized")
  try:
- # TODO: Avoid this by setting the working directory when launching the child process.
- with chdir(self._workdir):
- await self._do_initialize(config)
+ await self._do_initialize(config)
  await self._await_for_transpile_capability()
  # it is good practice to catch broad exceptions raised by launching a child process
  except Exception as e: # pylint: disable=broad-exception-caught
@@ -432,65 +432,50 @@ class LSPEngine(TranspileEngine):
  logger.debug(f"LSP init params: {params}")
  self._init_response = await self._client.initialize_async(params)

- async def _start_server(self):
- executable = self._config.remorph.command_line[0]
- if executable in {"python", "python3"}:
- await self._start_python_server()
- else:
- await self._start_other_server()
-
- async def _start_python_server(self):
- has_venv = (self._workdir / ".venv").exists()
- if has_venv:
- await self._start_python_server_with_venv()
- else:
- await self._start_python_server_without_venv()
-
- async def _start_python_server_with_venv(self):
- env: dict[str, str] = os.environ | self._config.remorph.env_vars
- # ensure modules are searched within venv
- if "PYTHONPATH" in env.keys():
- del env["PYTHONPATH"]
- if "VIRTUAL_ENV" in env.keys():
- del env["VIRTUAL_ENV"]
- if "VIRTUAL_ENV_PROMPT" in env.keys():
- del env["VIRTUAL_ENV_PROMPT"]
- path = self._workdir / ".venv" / "Scripts" if sys.platform == "win32" else self._workdir / ".venv" / "bin"
- if "PATH" in env.keys():
- env["PATH"] = str(path) + os.pathsep + env["PATH"]
- else:
- env["PATH"] = str(path)
- python = "python.exe" if sys.platform == "win32" else "python3"
- executable = path / python
- await self._launch_executable(executable, env)
-
- async def _start_python_server_without_venv(self):
- env: dict[str, str] = os.environ | self._config.remorph.env_vars
- # ensure modules are searched locally before being searched in remorph
- if "PYTHONPATH" in env.keys():
- env["PYTHONPATH"] = str(self._workdir) + os.pathsep + env["PYTHONPATH"]
- else:
- env["PYTHONPATH"] = str(self._workdir)
- executable = Path(self._config.remorph.command_line[0])
- await self._launch_executable(executable, env)
+ async def _start_server(self) -> None:
+ # Sanity-check and split the command-line into components.
+ if not (command_line := self._config.remorph.command_line):
+ raise ValueError(f"Missing command line for LSP server: {self._config.path}")
+ executable, *args = command_line

- async def _start_other_server(self):
+ # Extract the environment, preparing to ensure that PATH is set correctly.
  env: dict[str, str] = os.environ | self._config.remorph.env_vars
- # ensure modules are searched within venv
- if "PYTHONPATH" in env.keys():
- del env["PYTHONPATH"]
- if "VIRTUAL_ENV" in env.keys():
- del env["VIRTUAL_ENV"]
- if "VIRTUAL_ENV_PROMPT" in env.keys():
- del env["VIRTUAL_ENV_PROMPT"]
- executable = Path(self._config.remorph.command_line[0])
- await self._launch_executable(executable, env)
-
- async def _launch_executable(self, executable: Path, env: Mapping):
+ path = env.get("PATH", os.defpath)
+
+ # If we have a virtual environment, ensure the bin directory is first on the PATH. This normally takes
+ # care of python executables, but also deals with any entry-points that the LSP server might install.
+ if (venv_path := self._workdir / ".venv").exists():
+ executable, additional_path = self._activate_venv(venv_path, executable)
+ # Ensure PATH is in sync with the search path we will use to locate the LSP server executable.
+ env["PATH"] = path = f"{additional_path}{os.pathsep}{path}"
+ logger.debug(f"Using PATH for launching LSP server: {path}")
+
+ # Locate the LSP server executable in a platform-independent way.
+ # Reference: https://docs.python.org/3/library/subprocess.html#popen-constructor
+ executable = shutil.which(executable, path=path) or executable
+
+ await self._launch_executable(executable, args, env)
+
+ @staticmethod
+ def _activate_venv(venv_path: Path, executable: str) -> tuple[str, Path]:
+ """Obtain the bin/script directory for the virtual environment, to extend the search path."""
+ logger.debug(f"Detected virtual environment to use at: {venv_path}")
+ use_symlinks = sys.platform != "win32"
+ builder = venv.EnvBuilder(symlinks=use_symlinks)
+ context = builder.ensure_directories(venv_path)
+
+ # Workaround for Windows, where bin_path (Scripts/) doesn't contain python3.exe: if the executable is python
+ # or python3, we substitute it for what is needed to launch the venv's python interpreter.
+ if os.path.normcase(executable) in {"python", "python3"}:
+ executable = context.env_exec_cmd
+
+ return executable, context.bin_path
+
+ async def _launch_executable(self, executable: str, args: Sequence[str], env: Mapping[str, str]) -> None:
  log_level = logging.getLevelName(logging.getLogger("databricks").level)
- args = self._config.remorph.command_line[1:] + [f"--log_level={log_level}"]
- logger.debug(f"Starting LSP engine: {executable} {args} (cwd={os.getcwd()})")
- await self._client.start_io(str(executable), env=env, *args)
+ args = [*args, f"--log_level={log_level}"]
+ logger.debug(f"Starting LSP engine: {executable} {args} (cwd={self._workdir})")
+ await self._client.start_io(executable, *args, env=env, cwd=self._workdir)

  def _client_capabilities(self):
  return ClientCapabilities() # TODO do we need to refine this ?
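The rewritten _start_server above collapses the per-case launch paths into one flow: detect a project-local .venv, prepend its bin/Scripts directory to PATH, resolve the executable with shutil.which, and hand off to the LSP client with an explicit working directory. A stand-alone sketch of the same technique; the helper name and paths are illustrative, not part of the package:

```python
import os
import shutil
import sys
import venv
from pathlib import Path


def resolve_command(workdir: Path, executable: str) -> tuple[str, dict[str, str]]:
    env = dict(os.environ)
    path = env.get("PATH", os.defpath)
    venv_path = workdir / ".venv"
    if venv_path.exists():
        # ensure_directories() returns the venv layout (bin_path, env_exec_cmd) without
        # clearing an existing environment.
        context = venv.EnvBuilder(symlinks=sys.platform != "win32").ensure_directories(venv_path)
        if os.path.normcase(executable) in {"python", "python3"}:
            executable = context.env_exec_cmd
        env["PATH"] = path = f"{context.bin_path}{os.pathsep}{path}"
    return shutil.which(executable, path=path) or executable, env


# command, env = resolve_command(Path("/path/to/transpiler"), "python3")
# The LSP client would then be started with that command, env=env, and cwd set to the workdir.
```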
databricks/labs/lakebridge/transpiler/sqlglot/dialect_utils.py

@@ -18,6 +18,8 @@ SQLGLOT_DIALECTS: dict[str, type[Dialect] | str] = {
  "teradata": Dialects.TERADATA,
  "trino": Dialects.TRINO,
  "tsql": Dialects.TSQL,
+ "mssql": Dialects.TSQL,
+ "synapse": Dialects.TSQL,
  "vertica": Dialects.POSTGRES,
  }

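The two added entries let TSQL-family sources be configured under the names "mssql" or "synapse". A quick check of the mapping:

```python
from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import SQLGLOT_DIALECTS

# "mssql" and "synapse" are aliases for the same TSQL dialect entry as "tsql".
assert SQLGLOT_DIALECTS["mssql"] is SQLGLOT_DIALECTS["tsql"]
assert SQLGLOT_DIALECTS["synapse"] is SQLGLOT_DIALECTS["tsql"]
```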
databricks/labs/lakebridge/transpiler/transpile_engine.py

@@ -6,24 +6,6 @@ from databricks.labs.lakebridge.config import TranspileResult, TranspileConfig


  class TranspileEngine(abc.ABC):
-
- @classmethod
- def load_engine(cls, transpiler_config_path: Path) -> TranspileEngine:
- # TODO remove this once sqlglot transpiler is pluggable
- if str(transpiler_config_path) == "sqlglot":
- # pylint: disable=import-outside-toplevel, cyclic-import
- from databricks.labs.lakebridge.transpiler.sqlglot.sqlglot_engine import SqlglotEngine
-
- return SqlglotEngine()
- if not transpiler_config_path.exists():
- raise ValueError(
- f"Error: Invalid value for '--transpiler-config-path': '{str(transpiler_config_path)}', file does not exist."
- )
- # pylint: disable=import-outside-toplevel, cyclic-import
- from databricks.labs.lakebridge.transpiler.lsp.lsp_engine import LSPEngine
-
- return LSPEngine.from_config_path(transpiler_config_path)
-

  @abc.abstractmethod
  async def initialize(self, config: TranspileConfig) -> None: ...
databricks_labs_lakebridge-0.10.7.dist-info/METADATA

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: databricks-labs-lakebridge
- Version: 0.10.6
+ Version: 0.10.7
  Summary: Fast and predictable migrations to Databricks Lakehouse Platform. This tool is designed to help you migrate your data and workloads to the Databricks Lakehouse Platform in a fast, predictable, and reliable way. It provides a set of tools and utilities to help you reconcile your data and workloads, assess your current state, and plan your migration.
  Project-URL: Documentation, https://databrickslabs.github.io/lakebridge
  Project-URL: Issues, https://github.com/databrickslabs/lakebridge/issues