databricks-labs-lakebridge 0.10.5__py3-none-any.whl → 0.10.7__py3-none-any.whl
This diff compares the contents of two publicly available versions of this package as released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions exactly as they appear in the public registry.
- databricks/labs/lakebridge/__about__.py +1 -1
- databricks/labs/lakebridge/analyzer/__init__.py +0 -0
- databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py +95 -0
- databricks/labs/lakebridge/base_install.py +24 -3
- databricks/labs/lakebridge/cli.py +57 -72
- databricks/labs/lakebridge/config.py +1 -1
- databricks/labs/lakebridge/contexts/application.py +11 -4
- databricks/labs/lakebridge/deployment/dashboard.py +2 -1
- databricks/labs/lakebridge/deployment/installation.py +11 -11
- databricks/labs/lakebridge/deployment/job.py +2 -2
- databricks/labs/lakebridge/helpers/file_utils.py +36 -0
- databricks/labs/lakebridge/install.py +228 -278
- databricks/labs/lakebridge/reconcile/compare.py +70 -33
- databricks/labs/lakebridge/reconcile/connectors/data_source.py +19 -0
- databricks/labs/lakebridge/reconcile/connectors/databricks.py +11 -1
- databricks/labs/lakebridge/reconcile/connectors/dialect_utils.py +126 -0
- databricks/labs/lakebridge/reconcile/connectors/models.py +7 -0
- databricks/labs/lakebridge/reconcile/connectors/oracle.py +11 -1
- databricks/labs/lakebridge/reconcile/connectors/snowflake.py +14 -2
- databricks/labs/lakebridge/reconcile/connectors/tsql.py +27 -2
- databricks/labs/lakebridge/reconcile/constants.py +4 -3
- databricks/labs/lakebridge/reconcile/execute.py +9 -810
- databricks/labs/lakebridge/reconcile/normalize_recon_config_service.py +133 -0
- databricks/labs/lakebridge/reconcile/query_builder/base.py +3 -7
- databricks/labs/lakebridge/reconcile/recon_config.py +3 -0
- databricks/labs/lakebridge/reconcile/recon_output_config.py +2 -1
- databricks/labs/lakebridge/reconcile/reconciliation.py +508 -0
- databricks/labs/lakebridge/reconcile/schema_compare.py +26 -19
- databricks/labs/lakebridge/reconcile/trigger_recon_aggregate_service.py +98 -0
- databricks/labs/lakebridge/reconcile/trigger_recon_service.py +253 -0
- databricks/labs/lakebridge/reconcile/utils.py +38 -0
- databricks/labs/lakebridge/transpiler/lsp/lsp_engine.py +48 -63
- databricks/labs/lakebridge/transpiler/repository.py +123 -0
- databricks/labs/lakebridge/transpiler/sqlglot/dialect_utils.py +2 -0
- databricks/labs/lakebridge/transpiler/transpile_engine.py +0 -18
- {databricks_labs_lakebridge-0.10.5.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/METADATA +1 -1
- {databricks_labs_lakebridge-0.10.5.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/RECORD +41 -31
- {databricks_labs_lakebridge-0.10.5.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/WHEEL +0 -0
- {databricks_labs_lakebridge-0.10.5.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/entry_points.txt +0 -0
- {databricks_labs_lakebridge-0.10.5.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/licenses/LICENSE +0 -0
- {databricks_labs_lakebridge-0.10.5.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/licenses/NOTICE +0 -0
databricks/labs/lakebridge/reconcile/compare.py:
@@ -3,6 +3,7 @@ from functools import reduce
 from pyspark.sql import DataFrame, SparkSession
 from pyspark.sql.functions import col, expr, lit
 
+from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils
 from databricks.labs.lakebridge.reconcile.exception import ColumnMismatchException
 from databricks.labs.lakebridge.reconcile.recon_capture import (
     ReconIntermediatePersist,
@@ -22,7 +23,7 @@ _HASH_COLUMN_NAME = "hash_value_recon"
 _SAMPLE_ROWS = 50
 
 
-def raise_column_mismatch_exception(msg: str, source_missing: list[str], target_missing: list[str]) -> Exception:
+def _raise_column_mismatch_exception(msg: str, source_missing: list[str], target_missing: list[str]) -> Exception:
     error_msg = (
         f"{msg}\n"
         f"columns missing in source: {','.join(source_missing) if source_missing else None}\n"
@@ -33,12 +34,25 @@ def raise_column_mismatch_exception(msg: str, source_missing: list[str], target_
 
 def _generate_join_condition(source_alias, target_alias, key_columns):
     conditions = [
-        col(f"{source_alias}.{key_column}").eqNullSafe(
+        col(f"{source_alias}.{DialectUtils.ansi_normalize_identifier(key_column)}").eqNullSafe(
+            col(f"{target_alias}.{DialectUtils.ansi_normalize_identifier(key_column)}")
+        )
         for key_column in key_columns
     ]
     return reduce(lambda a, b: a & b, conditions)
 
 
+def _build_column_selector(table_name, column_name):
+    alias = DialectUtils.ansi_normalize_identifier(f"{table_name}_{DialectUtils.unnormalize_identifier(column_name)}")
+    return f'{table_name}.{DialectUtils.ansi_normalize_identifier(column_name)} as {alias}'
+
+
+def _build_mismatch_column(table, column):
+    return col(DialectUtils.ansi_normalize_identifier(column)).alias(
+        DialectUtils.unnormalize_identifier(column.replace(f'{table}_', '').lower())
+    )
+
+
 def reconcile_data(
     source: DataFrame,
     target: DataFrame,
@@ -59,14 +73,14 @@ def reconcile_data(
             how="full",
         )
         .selectExpr(
-            *[f'{source_alias
-            *[f'{target_alias
+            *[f'{_build_column_selector(source_alias, col_name)}' for col_name in source.columns],
+            *[f'{_build_column_selector(target_alias, col_name)}' for col_name in target.columns],
         )
     )
 
     # Write unmatched df to volume
     df = ReconIntermediatePersist(spark, path).write_and_read_unmatched_df_with_volumes(df)
-    logger.warning(f"Unmatched data
+    logger.warning(f"Unmatched data was written to {path} successfully")
 
     mismatch = _get_mismatch_data(df, source_alias, target_alias) if report_type in {"all", "data"} else None
 
@@ -74,24 +88,24 @@
         df.filter(col(f"{source_alias}_{_HASH_COLUMN_NAME}").isNull())
         .select(
             *[
-
+                _build_mismatch_column(target_alias, col_name)
                 for col_name in df.columns
                 if col_name.startswith(f'{target_alias}_')
            ]
        )
-        .drop(_HASH_COLUMN_NAME)
+        .drop(f"{_HASH_COLUMN_NAME}")
    )
 
    missing_in_tgt = (
        df.filter(col(f"{target_alias}_{_HASH_COLUMN_NAME}").isNull())
        .select(
            *[
-
+                _build_mismatch_column(source_alias, col_name)
                for col_name in df.columns
                if col_name.startswith(f'{source_alias}_')
            ]
        )
-        .drop(_HASH_COLUMN_NAME)
+        .drop(f"{_HASH_COLUMN_NAME}")
    )
    mismatch_count = 0
    if mismatch:
@@ -123,23 +137,27 @@ def _get_mismatch_data(df: DataFrame, src_alias: str, tgt_alias: str) -> DataFra
        .filter(col("hash_match") == lit(False))
        .select(
            *[
-
+                _build_mismatch_column(src_alias, col_name)
                for col_name in df.columns
                if col_name.startswith(f'{src_alias}_')
            ]
        )
-        .drop(_HASH_COLUMN_NAME)
+        .drop(f"{_HASH_COLUMN_NAME}")
    )
 
 
-def
-
-
+def _build_capture_df(df: DataFrame) -> DataFrame:
+    columns = [
+        col(DialectUtils.ansi_normalize_identifier(column)).alias(DialectUtils.unnormalize_identifier(column))
+        for column in df.columns
+    ]
+    return df.select(*columns)
 
 
 def capture_mismatch_data_and_columns(source: DataFrame, target: DataFrame, key_columns: list[str]) -> MismatchOutput:
-    source_df =
-    target_df =
+    source_df = _build_capture_df(source)
+    target_df = _build_capture_df(target)
+    unnormalized_key_columns = [DialectUtils.unnormalize_identifier(column) for column in key_columns]
 
    source_columns = source_df.columns
    target_columns = target_df.columns
@@ -148,10 +166,10 @@ def capture_mismatch_data_and_columns(source: DataFrame, target: DataFrame, key_
        message = "source and target should have same columns for capturing the mismatch data"
        source_missing = [column for column in target_columns if column not in source_columns]
        target_missing = [column for column in source_columns if column not in target_columns]
-        raise
+        raise _raise_column_mismatch_exception(message, source_missing, target_missing)
 
-    check_columns = [column for column in source_columns if column not in
-    mismatch_df = _get_mismatch_df(source_df, target_df,
+    check_columns = [column for column in source_columns if column not in unnormalized_key_columns]
+    mismatch_df = _get_mismatch_df(source_df, target_df, unnormalized_key_columns, check_columns)
    mismatch_columns = _get_mismatch_columns(mismatch_df, check_columns)
    return MismatchOutput(mismatch_df, mismatch_columns)
 
@@ -167,31 +185,50 @@ def _get_mismatch_columns(df: DataFrame, columns: list[str]):
    return mismatch_columns
 
 
+def _normalize_mismatch_df_col(column, suffix):
+    unnormalized = DialectUtils.unnormalize_identifier(column) + suffix
+    return DialectUtils.ansi_normalize_identifier(unnormalized)
+
+
+def _unnormalize_mismatch_df_col(column, suffix):
+    unnormalized = DialectUtils.unnormalize_identifier(column) + suffix
+    return unnormalized
+
+
 def _get_mismatch_df(source: DataFrame, target: DataFrame, key_columns: list[str], column_list: list[str]):
-    source_aliased = [
-
+    source_aliased = [
+        col('base.' + DialectUtils.ansi_normalize_identifier(column)).alias(
+            _unnormalize_mismatch_df_col(column, '_base')
+        )
+        for column in column_list
+    ]
+    target_aliased = [
+        col('compare.' + DialectUtils.ansi_normalize_identifier(column)).alias(
+            _unnormalize_mismatch_df_col(column, '_compare')
+        )
+        for column in column_list
+    ]
 
-    match_expr = [
-
+    match_expr = [
+        expr(f"{_normalize_mismatch_df_col(column,'_base')}=={_normalize_mismatch_df_col(column,'_compare')}").alias(
+            _unnormalize_mismatch_df_col(column, '_match')
+        )
+        for column in column_list
+    ]
+    key_cols = [col(DialectUtils.ansi_normalize_identifier(column)) for column in key_columns]
    select_expr = key_cols + source_aliased + target_aliased + match_expr
 
-    filter_columns = " and ".join([column + "_match" for column in column_list])
-    filter_expr = ~expr(filter_columns)
-
    logger.info(f"KEY COLUMNS: {key_columns}")
-    logger.info(f"FILTER COLUMNS: {filter_expr}")
    logger.info(f"SELECT COLUMNS: {select_expr}")
 
    mismatch_df = (
        source.alias('base').join(other=target.alias('compare'), on=key_columns, how="inner").select(*select_expr)
    )
 
-    compare_columns = [
-
-
-
-def alias_column_str(alias: str, columns: list[str]) -> list[str]:
-    return [f"{alias}.{column}" for column in columns]
+    compare_columns = [
+        DialectUtils.ansi_normalize_identifier(column) for column in mismatch_df.columns if column not in key_columns
+    ]
+    return mismatch_df.select(*key_cols + sorted(compare_columns))
 
 
 def _generate_agg_join_condition(source_alias: str, target_alias: str, key_columns: list[str]):
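For illustration only (not part of the package diff): a minimal sketch of the alias strings the new _build_column_selector helper in compare.py feeds into selectExpr, using simplified stand-ins for DialectUtils.ansi_normalize_identifier and unnormalize_identifier (backticks added or stripped).

    # Sketch only: simplified stand-ins for the DialectUtils behaviour.
    def ansi_normalize(identifier: str) -> str:
        # wrap in backticks unless the identifier is already wrapped
        if identifier.startswith("`") and identifier.endswith("`"):
            return identifier
        return f"`{identifier}`"

    def unnormalize(identifier: str) -> str:
        # drop the outer backticks added by ansi_normalize
        return ansi_normalize(identifier)[1:-1]

    def build_column_selector(table_name: str, column_name: str) -> str:
        # mirrors _build_column_selector: qualify the column and alias it as
        # <table>_<column>, keeping the whole alias delimited
        alias = ansi_normalize(f"{table_name}_{unnormalize(column_name)}")
        return f"{table_name}.{ansi_normalize(column_name)} as {alias}"

    print(build_column_selector("source", "`order id`"))  # source.`order id` as `source_order id`
    print(build_column_selector("target", "amount"))      # target.`amount` as `target_amount`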
databricks/labs/lakebridge/reconcile/connectors/data_source.py:
@@ -3,6 +3,7 @@ from abc import ABC, abstractmethod
 
 from pyspark.sql import DataFrame
 
+from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier
 from databricks.labs.lakebridge.reconcile.exception import DataSourceRuntimeException
 from databricks.labs.lakebridge.reconcile.recon_config import JdbcReaderOptions, Schema
 
@@ -31,12 +32,27 @@ class DataSource(ABC):
     ) -> list[Schema]:
         return NotImplemented
 
+    @abstractmethod
+    def normalize_identifier(self, identifier: str) -> NormalizedIdentifier:
+        pass
+
     @classmethod
     def log_and_throw_exception(cls, exception: Exception, fetch_type: str, query: str):
         error_msg = f"Runtime exception occurred while fetching {fetch_type} using {query} : {exception}"
         logger.warning(error_msg)
         raise DataSourceRuntimeException(error_msg) from exception
 
+    def _map_meta_column(self, meta_column) -> Schema:
+        """Create a normalized Schema DTO from the database metadata
+
+        Used in the implementations of get_schema to build a Schema DTO from the `INFORMATION_SCHEMA` query result.
+        The returned Schema is normalized in case the database is having columns with special characters and standardize
+        """
+        name = meta_column.col_name
+        dtype = meta_column.data_type.strip().lower()
+        normalized = self.normalize_identifier(name)
+        return Schema(normalized.ansi_normalized, dtype, normalized.ansi_normalized, normalized.source_normalized)
+
 
 class MockDataSource(DataSource):
 
@@ -70,3 +86,6 @@ class MockDataSource(DataSource):
         if not mock_schema:
             return self.log_and_throw_exception(self._exception, "schema", f"({catalog}, {schema}, {table})")
         return mock_schema
+
+    def normalize_identifier(self, identifier: str) -> NormalizedIdentifier:
+        return NormalizedIdentifier(identifier, identifier)
databricks/labs/lakebridge/reconcile/connectors/databricks.py:
@@ -8,7 +8,9 @@ from pyspark.sql.functions import col
 from sqlglot import Dialect
 
 from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource
+from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier
 from databricks.labs.lakebridge.reconcile.connectors.secrets import SecretsMixin
+from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils
 from databricks.labs.lakebridge.reconcile.recon_config import JdbcReaderOptions, Schema
 from databricks.sdk import WorkspaceClient
 
@@ -35,6 +37,7 @@ def _get_schema_query(catalog: str, schema: str, table: str):
 
 
 class DatabricksDataSource(DataSource, SecretsMixin):
+    _IDENTIFIER_DELIMITER = "`"
 
     def __init__(
         self,
@@ -82,6 +85,13 @@ class DatabricksDataSource(DataSource, SecretsMixin):
             logger.info(f"Fetching Schema: Started at: {datetime.now()}")
             schema_metadata = self._spark.sql(schema_query).where("col_name not like '#%'").distinct().collect()
             logger.info(f"Schema fetched successfully. Completed at: {datetime.now()}")
-            return [
+            return [self._map_meta_column(field) for field in schema_metadata]
         except (RuntimeError, PySparkException) as e:
             return self.log_and_throw_exception(e, "schema", schema_query)
+
+    def normalize_identifier(self, identifier: str) -> NormalizedIdentifier:
+        return DialectUtils.normalize_identifier(
+            identifier,
+            source_start_delimiter=DatabricksDataSource._IDENTIFIER_DELIMITER,
+            source_end_delimiter=DatabricksDataSource._IDENTIFIER_DELIMITER,
+        )
databricks/labs/lakebridge/reconcile/connectors/dialect_utils.py (new file):
@@ -0,0 +1,126 @@
+from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier
+
+
+class DialectUtils:
+    _ANSI_IDENTIFIER_DELIMITER = "`"
+
+    @staticmethod
+    def unnormalize_identifier(identifier: str) -> str:
+        """Return an ansi identifier without the outer backticks.
+
+        Use this at your own risk as the missing outer backticks will result in bugs.
+        E.g. <`mary's lamb`> is returned <mary's lamb> so the outer backticks are needed.
+        This is useful for scenarios where the returned identifier will be part of another delimited identifier.
+
+        :param identifier: a database identifier
+        :return: ansi identifier without the outer backticks
+        """
+        ansi = DialectUtils.ansi_normalize_identifier(identifier)
+        unescape = (
+            DialectUtils._unescape_source_end_delimiter(ansi[1:-1], DialectUtils._ANSI_IDENTIFIER_DELIMITER)
+            if ansi
+            else ansi
+        )
+        return unescape
+
+    @staticmethod
+    def ansi_normalize_identifier(identifier: str) -> str:
+        return DialectUtils.normalize_identifier(
+            identifier, DialectUtils._ANSI_IDENTIFIER_DELIMITER, DialectUtils._ANSI_IDENTIFIER_DELIMITER
+        ).ansi_normalized
+
+    @staticmethod
+    def normalize_identifier(
+        identifier: str, source_start_delimiter: str, source_end_delimiter: str
+    ) -> NormalizedIdentifier:
+        identifier = identifier.strip().lower()
+
+        ansi = DialectUtils._normalize_identifier_source_agnostic(
+            identifier,
+            source_start_delimiter,
+            source_end_delimiter,
+            DialectUtils._ANSI_IDENTIFIER_DELIMITER,
+            DialectUtils._ANSI_IDENTIFIER_DELIMITER,
+        )
+
+        # Input was already ansi normalized
+        if ansi == identifier:
+            source = DialectUtils._normalize_identifier_source_agnostic(
+                identifier,
+                DialectUtils._ANSI_IDENTIFIER_DELIMITER,
+                DialectUtils._ANSI_IDENTIFIER_DELIMITER,
+                source_start_delimiter,
+                source_end_delimiter,
+            )
+
+            # Ansi has backticks escaped which has to be unescaped for other delimiters and escape source end delimiters
+            if source != ansi:
+                source = DialectUtils._unescape_source_end_delimiter(source, DialectUtils._ANSI_IDENTIFIER_DELIMITER)
+                source = (
+                    DialectUtils._escape_source_end_delimiter(source, source_start_delimiter, source_end_delimiter)
+                    if source
+                    else source
+                )
+        else:
+            # Make sure backticks are escaped properly for ansi and source end delimiters are unescaped
+            ansi = DialectUtils._unescape_source_end_delimiter(ansi, source_end_delimiter)
+            ansi = DialectUtils._escape_backticks(ansi) if ansi else ansi
+
+            if source_end_delimiter != DialectUtils._ANSI_IDENTIFIER_DELIMITER:
+                ansi = DialectUtils._unescape_source_end_delimiter(ansi, source_end_delimiter)
+
+            source = DialectUtils._normalize_identifier_source_agnostic(
+                identifier, source_start_delimiter, source_end_delimiter, source_start_delimiter, source_end_delimiter
+            )
+
+            # Make sure source end delimiter is escaped else nothing as it was already normalized
+            if source != identifier:
+                source = (
+                    DialectUtils._escape_source_end_delimiter(source, source_start_delimiter, source_end_delimiter)
+                    if source
+                    else source
+                )
+
+        return NormalizedIdentifier(ansi, source)
+
+    @staticmethod
+    def _normalize_identifier_source_agnostic(
+        identifier: str,
+        source_start_delimiter: str,
+        source_end_delimiter: str,
+        expected_source_start_delimiter: str,
+        expected_source_end_delimiter: str,
+    ) -> str:
+        if identifier == "" or identifier is None:
+            return ""
+
+        if DialectUtils.is_already_delimited(
+            identifier, expected_source_start_delimiter, expected_source_end_delimiter
+        ):
+            return identifier
+
+        if DialectUtils.is_already_delimited(identifier, source_start_delimiter, source_end_delimiter):
+            stripped_identifier = identifier.removeprefix(source_start_delimiter).removesuffix(source_end_delimiter)
+        else:
+            stripped_identifier = identifier
+        return f"{expected_source_start_delimiter}{stripped_identifier}{expected_source_end_delimiter}"
+
+    @staticmethod
+    def is_already_delimited(identifier: str, start_delimiter: str, end_delimiter: str) -> bool:
+        return identifier.startswith(start_delimiter) and identifier.endswith(end_delimiter)
+
+    @staticmethod
+    def _escape_backticks(identifier: str) -> str:
+        identifier = identifier[1:-1]
+        identifier = identifier.replace("`", "``")
+        return f"`{identifier}`"
+
+    @staticmethod
+    def _unescape_source_end_delimiter(identifier: str, source_end_delimiter: str) -> str:
+        return identifier.replace(f"{source_end_delimiter}{source_end_delimiter}", source_end_delimiter)
+
+    @staticmethod
+    def _escape_source_end_delimiter(identifier: str, start_end_delimiter, source_end_delimiter: str) -> str:
+        identifier = identifier[1:-1]
+        identifier = identifier.replace(source_end_delimiter, f"{source_end_delimiter}{source_end_delimiter}")
+        return f"{start_end_delimiter}{identifier}{source_end_delimiter}"
databricks/labs/lakebridge/reconcile/connectors/oracle.py:
@@ -9,7 +9,9 @@ from sqlglot import Dialect
 
 from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource
 from databricks.labs.lakebridge.reconcile.connectors.jdbc_reader import JDBCReaderMixin
+from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier
 from databricks.labs.lakebridge.reconcile.connectors.secrets import SecretsMixin
+from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils
 from databricks.labs.lakebridge.reconcile.recon_config import JdbcReaderOptions, Schema
 from databricks.sdk import WorkspaceClient
 
@@ -18,6 +20,7 @@ logger = logging.getLogger(__name__)
 
 class OracleDataSource(DataSource, SecretsMixin, JDBCReaderMixin):
     _DRIVER = "oracle"
+    _IDENTIFIER_DELIMITER = "\""
     _SCHEMA_QUERY = """select column_name, case when (data_precision is not null
                                                       and data_scale <> 0)
                                                       then data_type || '(' || data_precision || ',' || data_scale || ')'
@@ -91,7 +94,7 @@ class OracleDataSource(DataSource, SecretsMixin, JDBCReaderMixin):
             schema_metadata = df.select([col(c).alias(c.lower()) for c in df.columns]).collect()
             logger.info(f"Schema fetched successfully. Completed at: {datetime.now()}")
             logger.debug(f"schema_metadata: ${schema_metadata}")
-            return [
+            return [self._map_meta_column(field) for field in schema_metadata]
         except (RuntimeError, PySparkException) as e:
             return self.log_and_throw_exception(e, "schema", schema_query)
 
@@ -106,3 +109,10 @@ class OracleDataSource(DataSource, SecretsMixin, JDBCReaderMixin):
 
     def reader(self, query: str) -> DataFrameReader:
         return self._get_jdbc_reader(query, self.get_jdbc_url, OracleDataSource._DRIVER)
+
+    def normalize_identifier(self, identifier: str) -> NormalizedIdentifier:
+        return DialectUtils.normalize_identifier(
+            identifier,
+            source_start_delimiter=OracleDataSource._IDENTIFIER_DELIMITER,
+            source_end_delimiter=OracleDataSource._IDENTIFIER_DELIMITER,
+        )
databricks/labs/lakebridge/reconcile/connectors/snowflake.py:
@@ -11,7 +11,9 @@ from cryptography.hazmat.primitives import serialization
 
 from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource
 from databricks.labs.lakebridge.reconcile.connectors.jdbc_reader import JDBCReaderMixin
+from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier
 from databricks.labs.lakebridge.reconcile.connectors.secrets import SecretsMixin
+from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils
 from databricks.labs.lakebridge.reconcile.exception import InvalidSnowflakePemPrivateKey
 from databricks.labs.lakebridge.reconcile.recon_config import JdbcReaderOptions, Schema
 from databricks.sdk import WorkspaceClient
@@ -22,6 +24,8 @@ logger = logging.getLogger(__name__)
 
 class SnowflakeDataSource(DataSource, SecretsMixin, JDBCReaderMixin):
     _DRIVER = "snowflake"
+    _IDENTIFIER_DELIMITER = "\""
+
     """
     * INFORMATION_SCHEMA:
       - see https://docs.snowflake.com/en/sql-reference/info-schema#considerations-for-replacing-show-commands-with-information-schema-views
@@ -144,9 +148,10 @@
         try:
             logger.debug(f"Fetching schema using query: \n`{schema_query}`")
             logger.info(f"Fetching Schema: Started at: {datetime.now()}")
-
+            df = self.reader(schema_query).load()
+            schema_metadata = df.select([col(c).alias(c.lower()) for c in df.columns]).collect()
             logger.info(f"Schema fetched successfully. Completed at: {datetime.now()}")
-            return [
+            return [self._map_meta_column(field) for field in schema_metadata]
         except (RuntimeError, PySparkException) as e:
             return self.log_and_throw_exception(e, "schema", schema_query)
 
@@ -171,3 +176,10 @@
             raise NotFound(message) from e
 
         return self._spark.read.format("snowflake").option("dbtable", f"({query}) as tmp").options(**options)
+
+    def normalize_identifier(self, identifier: str) -> NormalizedIdentifier:
+        return DialectUtils.normalize_identifier(
+            identifier,
+            source_start_delimiter=SnowflakeDataSource._IDENTIFIER_DELIMITER,
+            source_end_delimiter=SnowflakeDataSource._IDENTIFIER_DELIMITER,
+        )
databricks/labs/lakebridge/reconcile/connectors/tsql.py:
@@ -9,7 +9,9 @@ from sqlglot import Dialect
 
 from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource
 from databricks.labs.lakebridge.reconcile.connectors.jdbc_reader import JDBCReaderMixin
+from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier
 from databricks.labs.lakebridge.reconcile.connectors.secrets import SecretsMixin
+from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils
 from databricks.labs.lakebridge.reconcile.recon_config import JdbcReaderOptions, Schema
 from databricks.sdk import WorkspaceClient
 
@@ -49,6 +51,7 @@ _SCHEMA_QUERY = """SELECT
 
 class TSQLServerDataSource(DataSource, SecretsMixin, JDBCReaderMixin):
     _DRIVER = "sqlserver"
+    _IDENTIFIER_DELIMITER = {"prefix": "[", "suffix": "]"}
 
     def __init__(
         self,
@@ -122,11 +125,33 @@
         try:
             logger.debug(f"Fetching schema using query: \n`{schema_query}`")
             logger.info(f"Fetching Schema: Started at: {datetime.now()}")
-
+            df = self.reader(schema_query).load()
+            schema_metadata = df.select([col(c).alias(c.lower()) for c in df.columns]).collect()
             logger.info(f"Schema fetched successfully. Completed at: {datetime.now()}")
-            return [
+            return [self._map_meta_column(field) for field in schema_metadata]
         except (RuntimeError, PySparkException) as e:
             return self.log_and_throw_exception(e, "schema", schema_query)
 
     def reader(self, query: str, prepare_query_str="") -> DataFrameReader:
         return self._get_jdbc_reader(query, self.get_jdbc_url, self._DRIVER, prepare_query_str)
+
+    def normalize_identifier(self, identifier: str) -> NormalizedIdentifier:
+        return DialectUtils.normalize_identifier(
+            TSQLServerDataSource._normalize_quotes(identifier),
+            source_start_delimiter=TSQLServerDataSource._IDENTIFIER_DELIMITER["prefix"],
+            source_end_delimiter=TSQLServerDataSource._IDENTIFIER_DELIMITER["suffix"],
+        )
+
+    @staticmethod
+    def _normalize_quotes(identifier: str):
+        if DialectUtils.is_already_delimited(identifier, '"', '"'):
+            identifier = identifier[1:-1]
+            identifier = identifier.replace('""', '"')
+            identifier = (
+                TSQLServerDataSource._IDENTIFIER_DELIMITER["prefix"]
+                + identifier
+                + TSQLServerDataSource._IDENTIFIER_DELIMITER["suffix"]
+            )
+            return identifier
+
+        return identifier
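For illustration only (assumes lakebridge 0.10.7 is installed): the effect of the new TSQLServerDataSource.normalize_identifier, which first rewrites double-quoted identifiers into bracket-delimited form and then normalizes with "[" / "]" as the source delimiters; the helper below re-creates that two-step path without needing a connector instance, and the expected output is inferred from the code above.

    from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils

    def tsql_normalize(identifier: str):
        # mirror _normalize_quotes: "Order ID" -> [Order ID], with "" unescaped to "
        if DialectUtils.is_already_delimited(identifier, '"', '"'):
            identifier = "[" + identifier[1:-1].replace('""', '"') + "]"
        return DialectUtils.normalize_identifier(identifier, source_start_delimiter="[", source_end_delimiter="]")

    result = tsql_normalize('"Order ID"')
    print(result.ansi_normalized, result.source_normalized)   # `order id` [order id]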
databricks/labs/lakebridge/reconcile/constants.py:
@@ -15,10 +15,11 @@ class AutoName(Enum):
 
 
 class ReconSourceType(AutoName):
-    SNOWFLAKE = auto()
-    ORACLE = auto()
     DATABRICKS = auto()
-
+    MSSQL = auto()
+    ORACLE = auto()
+    SNOWFLAKE = auto()
+    SYNAPSE = auto()
 
 
 class ReconReportType(AutoName):
|