databricks-labs-lakebridge 0.10.5__py3-none-any.whl → 0.10.7__py3-none-any.whl
This diff compares two publicly released versions of the package as published to a public registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in that registry.
- databricks/labs/lakebridge/__about__.py +1 -1
- databricks/labs/lakebridge/analyzer/__init__.py +0 -0
- databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py +95 -0
- databricks/labs/lakebridge/base_install.py +24 -3
- databricks/labs/lakebridge/cli.py +57 -72
- databricks/labs/lakebridge/config.py +1 -1
- databricks/labs/lakebridge/contexts/application.py +11 -4
- databricks/labs/lakebridge/deployment/dashboard.py +2 -1
- databricks/labs/lakebridge/deployment/installation.py +11 -11
- databricks/labs/lakebridge/deployment/job.py +2 -2
- databricks/labs/lakebridge/helpers/file_utils.py +36 -0
- databricks/labs/lakebridge/install.py +228 -278
- databricks/labs/lakebridge/reconcile/compare.py +70 -33
- databricks/labs/lakebridge/reconcile/connectors/data_source.py +19 -0
- databricks/labs/lakebridge/reconcile/connectors/databricks.py +11 -1
- databricks/labs/lakebridge/reconcile/connectors/dialect_utils.py +126 -0
- databricks/labs/lakebridge/reconcile/connectors/models.py +7 -0
- databricks/labs/lakebridge/reconcile/connectors/oracle.py +11 -1
- databricks/labs/lakebridge/reconcile/connectors/snowflake.py +14 -2
- databricks/labs/lakebridge/reconcile/connectors/tsql.py +27 -2
- databricks/labs/lakebridge/reconcile/constants.py +4 -3
- databricks/labs/lakebridge/reconcile/execute.py +9 -810
- databricks/labs/lakebridge/reconcile/normalize_recon_config_service.py +133 -0
- databricks/labs/lakebridge/reconcile/query_builder/base.py +3 -7
- databricks/labs/lakebridge/reconcile/recon_config.py +3 -0
- databricks/labs/lakebridge/reconcile/recon_output_config.py +2 -1
- databricks/labs/lakebridge/reconcile/reconciliation.py +508 -0
- databricks/labs/lakebridge/reconcile/schema_compare.py +26 -19
- databricks/labs/lakebridge/reconcile/trigger_recon_aggregate_service.py +98 -0
- databricks/labs/lakebridge/reconcile/trigger_recon_service.py +253 -0
- databricks/labs/lakebridge/reconcile/utils.py +38 -0
- databricks/labs/lakebridge/transpiler/lsp/lsp_engine.py +48 -63
- databricks/labs/lakebridge/transpiler/repository.py +123 -0
- databricks/labs/lakebridge/transpiler/sqlglot/dialect_utils.py +2 -0
- databricks/labs/lakebridge/transpiler/transpile_engine.py +0 -18
- {databricks_labs_lakebridge-0.10.5.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/METADATA +1 -1
- {databricks_labs_lakebridge-0.10.5.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/RECORD +41 -31
- {databricks_labs_lakebridge-0.10.5.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/WHEEL +0 -0
- {databricks_labs_lakebridge-0.10.5.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/entry_points.txt +0 -0
- {databricks_labs_lakebridge-0.10.5.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/licenses/LICENSE +0 -0
- {databricks_labs_lakebridge-0.10.5.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/licenses/NOTICE +0 -0
databricks/labs/lakebridge/reconcile/schema_compare.py
@@ -1,10 +1,10 @@
 import logging
-from dataclasses import asdict
 
 from pyspark.sql import DataFrame, SparkSession
 from pyspark.sql.types import BooleanType, StringType, StructField, StructType
 from sqlglot import Dialect, parse_one
 
+from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils
 from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import get_dialect
 from databricks.labs.lakebridge.reconcile.recon_config import Schema, Table
 from databricks.labs.lakebridge.reconcile.recon_output_config import SchemaMatchResult, SchemaReconcileOutput
@@ -20,8 +20,7 @@ class SchemaCompare:
     ):
         self.spark = spark
 
-
-    _schema_compare_schema: StructType = StructType(
+    _schema_compare_output_schema: StructType = StructType(
         [
            StructField("source_column", StringType(), False),
            StructField("source_datatype", StringType(), False),
@@ -47,14 +46,16 @@ class SchemaCompare:
         target_column_map = table_conf.to_src_col_map or {}
         master_schema_match_res = [
             SchemaMatchResult(
-
-
+                source_column_normalized=s.source_normalized_column_name,
+                source_column_normalized_ansi=s.ansi_normalized_column_name,
                 source_datatype=s.data_type,
+                databricks_column=target_column_map.get(s.ansi_normalized_column_name, s.ansi_normalized_column_name),
                 databricks_datatype=next(
                     (
                         tgt.data_type
                         for tgt in databricks_schema
-                        if tgt.
+                        if tgt.ansi_normalized_column_name
+                        == target_column_map.get(s.ansi_normalized_column_name, s.ansi_normalized_column_name)
                     ),
                     "",
                 ),
@@ -63,16 +64,22 @@ class SchemaCompare:
         ]
         return master_schema_match_res
 
-    def
-        """
-
-
-
-
-
-
+    def _create_output_dataframe(self, data: list[SchemaMatchResult], schema: StructType) -> DataFrame:
+        """Return a user-friendly dataframe for schema compare result."""
+        transformed = []
+        for item in data:
+            output = tuple(
+                [
+                    DialectUtils.unnormalize_identifier(item.source_column_normalized_ansi),
+                    item.source_datatype,
+                    DialectUtils.unnormalize_identifier(item.databricks_column),
+                    item.databricks_datatype,
+                    item.is_valid,
+                ]
+            )
+            transformed.append(output)
 
-        return
+        return self.spark.createDataFrame(transformed, schema)
 
     @classmethod
     def _parse(cls, source: Dialect, column: str, data_type: str) -> str:
@@ -88,10 +95,10 @@ class SchemaCompare:
 
     @classmethod
     def _validate_parsed_query(cls, master: SchemaMatchResult, parsed_query) -> None:
-        databricks_query = f"create table dummy ({master.
+        databricks_query = f"create table dummy ({master.source_column_normalized_ansi} {master.databricks_datatype})"
         logger.info(
             f"""
-        Source datatype: create table dummy ({master.
+        Source datatype: create table dummy ({master.source_column_normalized} {master.source_datatype})
         Parse datatype: {parsed_query}
         Databricks datatype: {databricks_query}
         """
@@ -116,11 +123,11 @@ class SchemaCompare:
         master_schema = self._build_master_schema(source_schema, databricks_schema, table_conf)
         for master in master_schema:
             if not isinstance(source, Databricks):
-                parsed_query = self._parse(source, master.
+                parsed_query = self._parse(source, master.source_column_normalized, master.source_datatype)
                 self._validate_parsed_query(master, parsed_query)
             elif master.source_datatype.lower() != master.databricks_datatype.lower():
                 master.is_valid = False
 
-        df = self.
+        df = self._create_output_dataframe(master_schema, self._schema_compare_output_schema)
         final_result = self._table_schema_status(master_schema)
         return SchemaReconcileOutput(final_result, df)
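The new `_create_output_dataframe` converts ANSI-normalized identifiers back to their user-facing form before building the result dataframe. The real conversion lives in the new `connectors/dialect_utils.py` (+126 lines, not shown in this diff); the sketch below is only an illustration of the intended round trip, with a hypothetical quote-stripping function standing in for the real `DialectUtils.unnormalize_identifier`:

    # Hypothetical stand-in for DialectUtils.unnormalize_identifier; the real
    # implementation lives in reconcile/connectors/dialect_utils.py.
    def unnormalize_identifier(identifier: str) -> str:
        # ANSI normalization wraps identifiers in double quotes; undo it for display.
        if identifier.startswith('"') and identifier.endswith('"'):
            return identifier[1:-1].replace('""', '"')
        return identifier

    assert unnormalize_identifier('"Order Id"') == "Order Id"
    assert unnormalize_identifier("order_id") == "order_id"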
databricks/labs/lakebridge/reconcile/trigger_recon_aggregate_service.py (new file)
@@ -0,0 +1,98 @@
+from datetime import datetime
+
+from pyspark.sql import SparkSession
+from databricks.sdk import WorkspaceClient
+
+from databricks.labs.lakebridge.config import ReconcileConfig, TableRecon
+from databricks.labs.lakebridge.reconcile import utils
+from databricks.labs.lakebridge.reconcile.exception import DataSourceRuntimeException, ReconciliationException
+from databricks.labs.lakebridge.reconcile.recon_capture import (
+    ReconIntermediatePersist,
+    generate_final_reconcile_aggregate_output,
+)
+from databricks.labs.lakebridge.reconcile.recon_config import Table, Schema, AGG_RECONCILE_OPERATION_NAME
+from databricks.labs.lakebridge.reconcile.recon_output_config import (
+    ReconcileProcessDuration,
+    AggregateQueryOutput,
+    DataReconcileOutput,
+)
+from databricks.labs.lakebridge.reconcile.reconciliation import Reconciliation
+from databricks.labs.lakebridge.reconcile.trigger_recon_service import TriggerReconService
+from databricks.labs.lakebridge.reconcile.normalize_recon_config_service import NormalizeReconConfigService
+
+
+class TriggerReconAggregateService:
+    @staticmethod
+    def trigger_recon_aggregates(
+        ws: WorkspaceClient,
+        spark: SparkSession,
+        table_recon: TableRecon,
+        reconcile_config: ReconcileConfig,
+        local_test_run: bool = False,
+    ):
+        reconciler, recon_capture = TriggerReconService.create_recon_dependencies(
+            ws, spark, reconcile_config, local_test_run
+        )
+
+        # Get the Aggregated Reconciliation Output for each table
+        for table_conf in table_recon.tables:
+            normalized_table_conf = NormalizeReconConfigService(
+                reconciler.source, reconciler.target
+            ).normalize_recon_table_config(table_conf)
+
+            recon_process_duration = ReconcileProcessDuration(start_ts=str(datetime.now()), end_ts=None)
+            try:
+                src_schema, tgt_schema = TriggerReconService.get_schemas(
+                    reconciler.source, reconciler.target, normalized_table_conf, reconcile_config.database_config
+                )
+            except DataSourceRuntimeException as e:
+                raise ReconciliationException(message=str(e)) from e
+
+            assert normalized_table_conf.aggregates, "Aggregates must be defined for Aggregates Reconciliation"
+
+            table_reconcile_agg_output_list: list[AggregateQueryOutput] = (
+                TriggerReconAggregateService._run_reconcile_aggregates(
+                    reconciler=reconciler,
+                    table_conf=normalized_table_conf,
+                    src_schema=src_schema,
+                    tgt_schema=tgt_schema,
+                )
+            )
+
+            recon_process_duration.end_ts = str(datetime.now())
+
+            # Persist the data to the delta tables
+            recon_capture.store_aggregates_metrics(
+                reconcile_agg_output_list=table_reconcile_agg_output_list,
+                table_conf=normalized_table_conf,
+                recon_process_duration=recon_process_duration,
+            )
+
+            (
+                ReconIntermediatePersist(
+                    spark=spark,
+                    path=utils.generate_volume_path(normalized_table_conf, reconcile_config.metadata_config),
+                ).clean_unmatched_df_from_volume()
+            )
+
+        return TriggerReconService.verify_successful_reconciliation(
+            generate_final_reconcile_aggregate_output(
+                recon_id=recon_capture.recon_id,
+                spark=spark,
+                metadata_config=reconcile_config.metadata_config,
+                local_test_run=local_test_run,
+            ),
+            operation_name=AGG_RECONCILE_OPERATION_NAME,
+        )
+
+    @staticmethod
+    def _run_reconcile_aggregates(
+        reconciler: Reconciliation,
+        table_conf: Table,
+        src_schema: list[Schema],
+        tgt_schema: list[Schema],
+    ) -> list[AggregateQueryOutput]:
+        try:
+            return reconciler.reconcile_aggregates(table_conf, src_schema, tgt_schema)
+        except DataSourceRuntimeException as e:
+            return [AggregateQueryOutput(reconcile_output=DataReconcileOutput(exception=str(e)), rule=None)]
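The aggregates entry point mirrors `TriggerReconService.trigger_recon` below, but every table in the recon config must define `aggregates`, or the `assert` above trips. A minimal caller sketch, assuming `ws`, `spark`, `table_recon`, and `reconcile_config` are already constructed elsewhere:

    from databricks.labs.lakebridge.reconcile.trigger_recon_aggregate_service import TriggerReconAggregateService

    # ws, spark, table_recon, and reconcile_config are assumed to exist already;
    # each entry in table_recon.tables must carry an `aggregates` definition.
    output = TriggerReconAggregateService.trigger_recon_aggregates(ws, spark, table_recon, reconcile_config)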
databricks/labs/lakebridge/reconcile/trigger_recon_service.py (new file)
@@ -0,0 +1,253 @@
+import logging
+from datetime import datetime
+from uuid import uuid4
+
+from pyspark.errors import PySparkException
+from pyspark.sql import SparkSession
+
+from databricks.sdk import WorkspaceClient
+
+from databricks.labs.lakebridge.config import ReconcileConfig, TableRecon, DatabaseConfig
+from databricks.labs.lakebridge.reconcile import utils
+from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource
+from databricks.labs.lakebridge.reconcile.exception import DataSourceRuntimeException, ReconciliationException
+from databricks.labs.lakebridge.reconcile.recon_capture import (
+    ReconCapture,
+    ReconIntermediatePersist,
+    generate_final_reconcile_output,
+)
+from databricks.labs.lakebridge.reconcile.recon_config import Table, Schema
+from databricks.labs.lakebridge.reconcile.recon_output_config import (
+    ReconcileOutput,
+    ReconcileProcessDuration,
+    SchemaReconcileOutput,
+    DataReconcileOutput,
+)
+from databricks.labs.lakebridge.reconcile.reconciliation import Reconciliation
+from databricks.labs.lakebridge.reconcile.schema_compare import SchemaCompare
+from databricks.labs.lakebridge.reconcile.normalize_recon_config_service import NormalizeReconConfigService
+from databricks.labs.lakebridge.transpiler.execute import verify_workspace_client
+from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import get_dialect
+
+logger = logging.getLogger(__name__)
+_RECON_REPORT_TYPES = {"schema", "data", "row", "all", "aggregate"}
+
+
+class TriggerReconService:
+
+    @staticmethod
+    def trigger_recon(
+        ws: WorkspaceClient,
+        spark: SparkSession,
+        table_recon: TableRecon,
+        reconcile_config: ReconcileConfig,
+        local_test_run: bool = False,
+    ) -> ReconcileOutput:
+        reconciler, recon_capture = TriggerReconService.create_recon_dependencies(
+            ws, spark, reconcile_config, local_test_run
+        )
+
+        for table_conf in table_recon.tables:
+            TriggerReconService.recon_one(spark, reconciler, recon_capture, reconcile_config, table_conf)
+
+        return TriggerReconService.verify_successful_reconciliation(
+            generate_final_reconcile_output(
+                recon_id=recon_capture.recon_id,
+                spark=spark,
+                metadata_config=reconcile_config.metadata_config,
+                local_test_run=local_test_run,
+            )
+        )
+
+    @staticmethod
+    def create_recon_dependencies(
+        ws: WorkspaceClient, spark: SparkSession, reconcile_config: ReconcileConfig, local_test_run: bool = False
+    ) -> tuple[Reconciliation, ReconCapture]:
+        ws_client: WorkspaceClient = verify_workspace_client(ws)
+
+        # validate the report type
+        report_type = reconcile_config.report_type.lower()
+        logger.info(f"report_type: {report_type}, data_source: {reconcile_config.data_source} ")
+        utils.validate_input(report_type, _RECON_REPORT_TYPES, "Invalid report type")
+
+        source, target = utils.initialise_data_source(
+            engine=reconcile_config.data_source,
+            spark=spark,
+            ws=ws_client,
+            secret_scope=reconcile_config.secret_scope,
+        )
+
+        recon_id = str(uuid4())
+        # initialise the Reconciliation
+        reconciler = Reconciliation(
+            source,
+            target,
+            reconcile_config.database_config,
+            report_type,
+            SchemaCompare(spark=spark),
+            get_dialect(reconcile_config.data_source),
+            spark,
+            metadata_config=reconcile_config.metadata_config,
+        )
+
+        recon_capture = ReconCapture(
+            database_config=reconcile_config.database_config,
+            recon_id=recon_id,
+            report_type=report_type,
+            source_dialect=get_dialect(reconcile_config.data_source),
+            ws=ws_client,
+            spark=spark,
+            metadata_config=reconcile_config.metadata_config,
+            local_test_run=local_test_run,
+        )
+
+        return reconciler, recon_capture
+
+    @staticmethod
+    def recon_one(
+        spark: SparkSession,
+        reconciler: Reconciliation,
+        recon_capture: ReconCapture,
+        reconcile_config: ReconcileConfig,
+        table_conf: Table,
+    ):
+        normalized_table_conf = NormalizeReconConfigService(
+            reconciler.source, reconciler.target
+        ).normalize_recon_table_config(table_conf)
+
+        schema_reconcile_output, data_reconcile_output, recon_process_duration = TriggerReconService._do_recon_one(
+            reconciler, reconcile_config, normalized_table_conf
+        )
+
+        TriggerReconService.persist_delta_table(
+            spark,
+            reconciler,
+            recon_capture,
+            schema_reconcile_output,
+            data_reconcile_output,
+            reconcile_config,
+            normalized_table_conf,
+            recon_process_duration,
+        )
+
+    @staticmethod
+    def _do_recon_one(reconciler: Reconciliation, reconcile_config: ReconcileConfig, table_conf: Table):
+        recon_process_duration = ReconcileProcessDuration(start_ts=str(datetime.now()), end_ts=None)
+        schema_reconcile_output = SchemaReconcileOutput(is_valid=True)
+        data_reconcile_output = DataReconcileOutput()
+
+        try:
+            src_schema, tgt_schema = TriggerReconService.get_schemas(
+                reconciler.source, reconciler.target, table_conf, reconcile_config.database_config
+            )
+        except DataSourceRuntimeException as e:
+            schema_reconcile_output = SchemaReconcileOutput(is_valid=False, exception=str(e))
+        else:
+            if reconciler.report_type in {"schema", "all"}:
+                schema_reconcile_output = TriggerReconService._run_reconcile_schema(
+                    reconciler=reconciler,
+                    table_conf=table_conf,
+                    src_schema=src_schema,
+                    tgt_schema=tgt_schema,
+                )
+                logger.warning("Schema comparison is completed.")
+
+            if reconciler.report_type in {"data", "row", "all"}:
+                data_reconcile_output = TriggerReconService._run_reconcile_data(
+                    reconciler=reconciler,
+                    table_conf=table_conf,
+                    src_schema=src_schema,
+                    tgt_schema=tgt_schema,
+                )
+                logger.warning(f"Reconciliation for '{reconciler.report_type}' report completed.")
+
+        recon_process_duration.end_ts = str(datetime.now())
+        return schema_reconcile_output, data_reconcile_output, recon_process_duration
+
+    @staticmethod
+    def get_schemas(
+        source: DataSource,
+        target: DataSource,
+        table_conf: Table,
+        database_config: DatabaseConfig,
+    ) -> tuple[list[Schema], list[Schema]]:
+        src_schema = source.get_schema(
+            catalog=database_config.source_catalog,
+            schema=database_config.source_schema,
+            table=table_conf.source_name,
+        )
+
+        tgt_schema = target.get_schema(
+            catalog=database_config.target_catalog,
+            schema=database_config.target_schema,
+            table=table_conf.target_name,
+        )
+
+        return src_schema, tgt_schema
+
+    @staticmethod
+    def _run_reconcile_schema(
+        reconciler: Reconciliation,
+        table_conf: Table,
+        src_schema: list[Schema],
+        tgt_schema: list[Schema],
+    ):
+        try:
+            return reconciler.reconcile_schema(table_conf=table_conf, src_schema=src_schema, tgt_schema=tgt_schema)
+        except PySparkException as e:
+            return SchemaReconcileOutput(is_valid=False, exception=str(e))
+
+    @staticmethod
+    def _run_reconcile_data(
+        reconciler: Reconciliation,
+        table_conf: Table,
+        src_schema: list[Schema],
+        tgt_schema: list[Schema],
+    ) -> DataReconcileOutput:
+        try:
+            return reconciler.reconcile_data(table_conf=table_conf, src_schema=src_schema, tgt_schema=tgt_schema)
+        except DataSourceRuntimeException as e:
+            return DataReconcileOutput(exception=str(e))
+
+    @staticmethod
+    def persist_delta_table(
+        spark: SparkSession,
+        reconciler: Reconciliation,
+        recon_capture: ReconCapture,
+        schema_reconcile_output: SchemaReconcileOutput,
+        data_reconcile_output: DataReconcileOutput,
+        reconcile_config: ReconcileConfig,
+        table_conf: Table,
+        recon_process_duration: ReconcileProcessDuration,
+    ):
+        recon_capture.start(
+            data_reconcile_output=data_reconcile_output,
+            schema_reconcile_output=schema_reconcile_output,
+            table_conf=table_conf,
+            recon_process_duration=recon_process_duration,
+            record_count=reconciler.get_record_count(table_conf, reconciler.report_type),
+        )
+        if reconciler.report_type != "schema":
+            ReconIntermediatePersist(
+                spark=spark, path=utils.generate_volume_path(table_conf, reconcile_config.metadata_config)
+            ).clean_unmatched_df_from_volume()
+
+    @staticmethod
+    def verify_successful_reconciliation(
+        reconcile_output: ReconcileOutput, operation_name: str = "reconcile"
+    ) -> ReconcileOutput:
+        for table_output in reconcile_output.results:
+            if table_output.exception_message or (
+                table_output.status.column is False
+                or table_output.status.row is False
+                or table_output.status.schema is False
+                or table_output.status.aggregate is False
+            ):
+                raise ReconciliationException(
+                    f" Reconciliation failed for one or more tables. Please check the recon metrics for more details."
+                    f" **{operation_name}** failed.",
+                    reconcile_output=reconcile_output,
+                )
+
+        logger.info("Reconciliation completed successfully.")
+        return reconcile_output
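Both trigger services share the same contract: they return the final `ReconcileOutput` when every table passes, and `verify_successful_reconciliation` raises `ReconciliationException` when any table reports an exception or a failed column, row, schema, or aggregate status. A minimal caller sketch (assuming `ws`, `spark`, `table_recon`, and `reconcile_config` are built elsewhere, and that the exception exposes the `reconcile_output` it is constructed with):

    from databricks.labs.lakebridge.reconcile.exception import ReconciliationException
    from databricks.labs.lakebridge.reconcile.trigger_recon_service import TriggerReconService

    try:
        output = TriggerReconService.trigger_recon(ws, spark, table_recon, reconcile_config)
        print(f"Reconciliation passed for {len(output.results)} tables.")
    except ReconciliationException as e:
        # The full per-table output travels with the exception for inspection.
        print(f"Reconciliation failed: {e}")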
databricks/labs/lakebridge/reconcile/utils.py (new file)
@@ -0,0 +1,38 @@
+import logging
+
+from pyspark.sql import SparkSession
+
+from databricks.sdk import WorkspaceClient
+
+from databricks.labs.lakebridge.config import ReconcileMetadataConfig
+from databricks.labs.lakebridge.reconcile.connectors.source_adapter import create_adapter
+from databricks.labs.lakebridge.reconcile.exception import InvalidInputException
+from databricks.labs.lakebridge.reconcile.recon_config import Table
+from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import get_dialect
+
+logger = logging.getLogger(__name__)
+
+
+def initialise_data_source(
+    ws: WorkspaceClient,
+    spark: SparkSession,
+    engine: str,
+    secret_scope: str,
+):
+    source = create_adapter(engine=get_dialect(engine), spark=spark, ws=ws, secret_scope=secret_scope)
+    target = create_adapter(engine=get_dialect("databricks"), spark=spark, ws=ws, secret_scope=secret_scope)
+
+    return source, target
+
+
+def validate_input(input_value: str, list_of_value: set, message: str):
+    if input_value not in list_of_value:
+        error_message = f"{message} --> {input_value} is not one of {list_of_value}"
+        logger.error(error_message)
+        raise InvalidInputException(error_message)
+
+
+def generate_volume_path(table_conf: Table, metadata_config: ReconcileMetadataConfig):
+    catalog = metadata_config.catalog
+    schema = metadata_config.schema
+    return f"/Volumes/{catalog}/{schema}/{metadata_config.volume}/{table_conf.source_name}_{table_conf.target_name}/"
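`generate_volume_path` is a plain f-string over the metadata config, so the location of intermediate artifacts in Unity Catalog volumes is fully predictable. A worked example of the same f-string (all values are illustrative):

    # Worked example of the generate_volume_path f-string; values are illustrative.
    catalog, schema, volume = "lakebridge", "reconcile", "reconcile_volume"
    source_name, target_name = "orders", "orders_bronze"
    path = f"/Volumes/{catalog}/{schema}/{volume}/{source_name}_{target_name}/"
    assert path == "/Volumes/lakebridge/reconcile/reconcile_volume/orders_orders_bronze/"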
databricks/labs/lakebridge/transpiler/lsp/lsp_engine.py
@@ -4,7 +4,9 @@ import abc
 import asyncio
 import logging
 import os
+import shutil
 import sys
+import venv
 from collections.abc import Callable, Sequence, Mapping
 from dataclasses import dataclass
 from pathlib import Path
@@ -35,7 +37,7 @@ from pygls.lsp.client import BaseLanguageClient
 from databricks.labs.blueprint.wheels import ProductInfo
 from databricks.labs.lakebridge.config import LSPConfigOptionV1, TranspileConfig, TranspileResult
 from databricks.labs.lakebridge.errors.exceptions import IllegalStateException
-from databricks.labs.lakebridge.helpers.file_utils import
+from databricks.labs.lakebridge.helpers.file_utils import is_dbt_project_file, is_sql_file
 from databricks.labs.lakebridge.transpiler.transpile_engine import TranspileEngine
 from databricks.labs.lakebridge.transpiler.transpile_status import (
     CodePosition,
@@ -409,9 +411,7 @@ class LSPEngine(TranspileEngine):
         if self.is_alive:
             raise IllegalStateException("LSP engine is already initialized")
         try:
-
-            with chdir(self._workdir):
-                await self._do_initialize(config)
+            await self._do_initialize(config)
             await self._await_for_transpile_capability()
         # it is good practice to catch broad exceptions raised by launching a child process
         except Exception as e:  # pylint: disable=broad-exception-caught
@@ -432,65 +432,50 @@ class LSPEngine(TranspileEngine):
         logger.debug(f"LSP init params: {params}")
         self._init_response = await self._client.initialize_async(params)
 
-    async def _start_server(self):
-
-        if
-
-
-            await self._start_other_server()
-
-    async def _start_python_server(self):
-        has_venv = (self._workdir / ".venv").exists()
-        if has_venv:
-            await self._start_python_server_with_venv()
-        else:
-            await self._start_python_server_without_venv()
-
-    async def _start_python_server_with_venv(self):
-        env: dict[str, str] = os.environ | self._config.remorph.env_vars
-        # ensure modules are searched within venv
-        if "PYTHONPATH" in env.keys():
-            del env["PYTHONPATH"]
-        if "VIRTUAL_ENV" in env.keys():
-            del env["VIRTUAL_ENV"]
-        if "VIRTUAL_ENV_PROMPT" in env.keys():
-            del env["VIRTUAL_ENV_PROMPT"]
-        path = self._workdir / ".venv" / "Scripts" if sys.platform == "win32" else self._workdir / ".venv" / "bin"
-        if "PATH" in env.keys():
-            env["PATH"] = str(path) + os.pathsep + env["PATH"]
-        else:
-            env["PATH"] = str(path)
-        python = "python.exe" if sys.platform == "win32" else "python3"
-        executable = path / python
-        await self._launch_executable(executable, env)
-
-    async def _start_python_server_without_venv(self):
-        env: dict[str, str] = os.environ | self._config.remorph.env_vars
-        # ensure modules are searched locally before being searched in remorph
-        if "PYTHONPATH" in env.keys():
-            env["PYTHONPATH"] = str(self._workdir) + os.pathsep + env["PYTHONPATH"]
-        else:
-            env["PYTHONPATH"] = str(self._workdir)
-        executable = Path(self._config.remorph.command_line[0])
-        await self._launch_executable(executable, env)
+    async def _start_server(self) -> None:
+        # Sanity-check and split the command-line into components.
+        if not (command_line := self._config.remorph.command_line):
+            raise ValueError(f"Missing command line for LSP server: {self._config.path}")
+        executable, *args = command_line
 
-
+        # Extract the environment, preparing to ensure that PATH is set correctly.
         env: dict[str, str] = os.environ | self._config.remorph.env_vars
-
-
-
-
-
-
-
-
-
-
-
+        path = env.get("PATH", os.defpath)
+
+        # If we have a virtual environment, ensure the bin directory is first on the PATH. This normally takes
+        # care of python executables, but also deals with any entry-points that the LSP server might install.
+        if (venv_path := self._workdir / ".venv").exists():
+            executable, additional_path = self._activate_venv(venv_path, executable)
+            # Ensure PATH is in sync with the search path we will use to locate the LSP server executable.
+            env["PATH"] = path = f"{additional_path}{os.pathsep}{path}"
+            logger.debug(f"Using PATH for launching LSP server: {path}")
+
+        # Locate the LSP server executable in a platform-independent way.
+        # Reference: https://docs.python.org/3/library/subprocess.html#popen-constructor
+        executable = shutil.which(executable, path=path) or executable
+
+        await self._launch_executable(executable, args, env)
+
+    @staticmethod
+    def _activate_venv(venv_path: Path, executable: str) -> tuple[str, Path]:
+        """Obtain the bin/script directory for the virtual environment, to extend the search path."""
+        logger.debug(f"Detected virtual environment to use at: {venv_path}")
+        use_symlinks = sys.platform != "win32"
+        builder = venv.EnvBuilder(symlinks=use_symlinks)
+        context = builder.ensure_directories(venv_path)
+
+        # Workaround for Windows, where bin_path (Scripts/) doesn't contain python3.exe: if the executable is python
+        # or python3, we substitute it for what is needed to launch the venv's python interpreter.
+        if os.path.normcase(executable) in {"python", "python3"}:
+            executable = context.env_exec_cmd
+
+        return executable, context.bin_path
+
+    async def _launch_executable(self, executable: str, args: Sequence[str], env: Mapping[str, str]) -> None:
         log_level = logging.getLevelName(logging.getLogger("databricks").level)
-        args =
-        logger.debug(f"Starting LSP engine: {executable} {args} (cwd={
-        await self._client.start_io(
+        args = [*args, f"--log_level={log_level}"]
+        logger.debug(f"Starting LSP engine: {executable} {args} (cwd={self._workdir})")
+        await self._client.start_io(executable, *args, env=env, cwd=self._workdir)
 
     def _client_capabilities(self):
         return ClientCapabilities()  # TODO do we need to refine this ?
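The rewritten `_start_server` leans on two standard-library pieces: `venv.EnvBuilder.ensure_directories`, which returns a context object describing a virtual environment's layout (`bin_path`, `env_exec_cmd`, and friends), and `shutil.which`, which resolves a bare command name against an explicit search path. A standalone sketch of the same resolution logic (paths are illustrative; note that `ensure_directories` creates missing directories, so only point it at a real venv):

    import os
    import shutil
    import sys
    import venv
    from pathlib import Path

    workdir = Path(".")  # illustrative; the engine uses its configured workdir
    search_path = os.environ.get("PATH", os.defpath)

    if (venv_path := workdir / ".venv").exists():
        context = venv.EnvBuilder(symlinks=sys.platform != "win32").ensure_directories(venv_path)
        # Put the venv's bin/Scripts directory first, as _start_server does.
        search_path = f"{context.bin_path}{os.pathsep}{search_path}"

    # Resolve the server command explicitly, the same way Popen would search PATH.
    print(shutil.which("python3", path=search_path) or "python3")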
@@ -532,18 +517,18 @@ class LSPEngine(TranspileEngine):
 
     def open_document(self, file_path: Path, source_code: str) -> None:
         text_document = TextDocumentItem(
-            uri=file_path.as_uri(), language_id=LanguageKind.Sql, version=1, text=source_code
+            uri=file_path.absolute().as_uri(), language_id=LanguageKind.Sql, version=1, text=source_code
         )
         params = DidOpenTextDocumentParams(text_document)
         self._client.text_document_did_open(params)
 
     def close_document(self, file_path: Path) -> None:
-        text_document = TextDocumentIdentifier(uri=file_path.as_uri())
+        text_document = TextDocumentIdentifier(uri=file_path.absolute().as_uri())
         params = DidCloseTextDocumentParams(text_document)
         self._client.text_document_did_close(params)
 
     async def transpile_document(self, file_path: Path) -> TranspileDocumentResult:
-        params = TranspileDocumentParams(uri=file_path.as_uri(), language_id=LanguageKind.Sql)
+        params = TranspileDocumentParams(uri=file_path.absolute().as_uri(), language_id=LanguageKind.Sql)
         result = await self._client.transpile_document_async(params)
         return result
 
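The repeated switch to `file_path.absolute().as_uri()` is not cosmetic: `pathlib.PurePath.as_uri()` raises `ValueError` for relative paths, so making the path absolute explicitly keeps document URIs valid regardless of the process working directory (which matters now that the `chdir(self._workdir)` wrapper is gone). A quick demonstration:

    from pathlib import Path

    relative = Path("query.sql")
    try:
        relative.as_uri()
    except ValueError as e:
        print(e)  # relative paths cannot be expressed as file URIs

    print(relative.absolute().as_uri())  # e.g. file:///home/user/project/query.sql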