databricks-labs-lakebridge 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (171)
  1. databricks/__init__.py +3 -0
  2. databricks/labs/__init__.py +3 -0
  3. databricks/labs/lakebridge/__about__.py +2 -0
  4. databricks/labs/lakebridge/__init__.py +11 -0
  5. databricks/labs/lakebridge/assessments/configure_assessment.py +194 -0
  6. databricks/labs/lakebridge/assessments/pipeline.py +188 -0
  7. databricks/labs/lakebridge/assessments/profiler_config.py +30 -0
  8. databricks/labs/lakebridge/base_install.py +12 -0
  9. databricks/labs/lakebridge/cli.py +449 -0
  10. databricks/labs/lakebridge/config.py +192 -0
  11. databricks/labs/lakebridge/connections/__init__.py +0 -0
  12. databricks/labs/lakebridge/connections/credential_manager.py +89 -0
  13. databricks/labs/lakebridge/connections/database_manager.py +98 -0
  14. databricks/labs/lakebridge/connections/env_getter.py +13 -0
  15. databricks/labs/lakebridge/contexts/__init__.py +0 -0
  16. databricks/labs/lakebridge/contexts/application.py +133 -0
  17. databricks/labs/lakebridge/coverage/__init__.py +0 -0
  18. databricks/labs/lakebridge/coverage/commons.py +223 -0
  19. databricks/labs/lakebridge/coverage/lakebridge_snow_transpilation_coverage.py +29 -0
  20. databricks/labs/lakebridge/coverage/local_report.py +9 -0
  21. databricks/labs/lakebridge/coverage/sqlglot_snow_transpilation_coverage.py +5 -0
  22. databricks/labs/lakebridge/coverage/sqlglot_tsql_transpilation_coverage.py +5 -0
  23. databricks/labs/lakebridge/deployment/__init__.py +0 -0
  24. databricks/labs/lakebridge/deployment/configurator.py +199 -0
  25. databricks/labs/lakebridge/deployment/dashboard.py +140 -0
  26. databricks/labs/lakebridge/deployment/installation.py +125 -0
  27. databricks/labs/lakebridge/deployment/job.py +147 -0
  28. databricks/labs/lakebridge/deployment/recon.py +145 -0
  29. databricks/labs/lakebridge/deployment/table.py +30 -0
  30. databricks/labs/lakebridge/deployment/upgrade_common.py +124 -0
  31. databricks/labs/lakebridge/discovery/table.py +36 -0
  32. databricks/labs/lakebridge/discovery/table_definition.py +23 -0
  33. databricks/labs/lakebridge/discovery/tsql_table_definition.py +185 -0
  34. databricks/labs/lakebridge/errors/exceptions.py +1 -0
  35. databricks/labs/lakebridge/helpers/__init__.py +0 -0
  36. databricks/labs/lakebridge/helpers/db_sql.py +24 -0
  37. databricks/labs/lakebridge/helpers/execution_time.py +20 -0
  38. databricks/labs/lakebridge/helpers/file_utils.py +64 -0
  39. databricks/labs/lakebridge/helpers/metastore.py +164 -0
  40. databricks/labs/lakebridge/helpers/recon_config_utils.py +176 -0
  41. databricks/labs/lakebridge/helpers/string_utils.py +62 -0
  42. databricks/labs/lakebridge/helpers/telemetry_utils.py +13 -0
  43. databricks/labs/lakebridge/helpers/validation.py +101 -0
  44. databricks/labs/lakebridge/install.py +849 -0
  45. databricks/labs/lakebridge/intermediate/__init__.py +0 -0
  46. databricks/labs/lakebridge/intermediate/dag.py +88 -0
  47. databricks/labs/lakebridge/intermediate/engine_adapter.py +0 -0
  48. databricks/labs/lakebridge/intermediate/root_tables.py +44 -0
  49. databricks/labs/lakebridge/jvmproxy.py +56 -0
  50. databricks/labs/lakebridge/lineage.py +42 -0
  51. databricks/labs/lakebridge/reconcile/__init__.py +0 -0
  52. databricks/labs/lakebridge/reconcile/compare.py +414 -0
  53. databricks/labs/lakebridge/reconcile/connectors/__init__.py +0 -0
  54. databricks/labs/lakebridge/reconcile/connectors/data_source.py +72 -0
  55. databricks/labs/lakebridge/reconcile/connectors/databricks.py +87 -0
  56. databricks/labs/lakebridge/reconcile/connectors/jdbc_reader.py +41 -0
  57. databricks/labs/lakebridge/reconcile/connectors/oracle.py +108 -0
  58. databricks/labs/lakebridge/reconcile/connectors/secrets.py +30 -0
  59. databricks/labs/lakebridge/reconcile/connectors/snowflake.py +173 -0
  60. databricks/labs/lakebridge/reconcile/connectors/source_adapter.py +30 -0
  61. databricks/labs/lakebridge/reconcile/connectors/sql_server.py +132 -0
  62. databricks/labs/lakebridge/reconcile/constants.py +37 -0
  63. databricks/labs/lakebridge/reconcile/exception.py +42 -0
  64. databricks/labs/lakebridge/reconcile/execute.py +920 -0
  65. databricks/labs/lakebridge/reconcile/query_builder/__init__.py +0 -0
  66. databricks/labs/lakebridge/reconcile/query_builder/aggregate_query.py +293 -0
  67. databricks/labs/lakebridge/reconcile/query_builder/base.py +138 -0
  68. databricks/labs/lakebridge/reconcile/query_builder/count_query.py +33 -0
  69. databricks/labs/lakebridge/reconcile/query_builder/expression_generator.py +292 -0
  70. databricks/labs/lakebridge/reconcile/query_builder/hash_query.py +91 -0
  71. databricks/labs/lakebridge/reconcile/query_builder/sampling_query.py +123 -0
  72. databricks/labs/lakebridge/reconcile/query_builder/threshold_query.py +231 -0
  73. databricks/labs/lakebridge/reconcile/recon_capture.py +635 -0
  74. databricks/labs/lakebridge/reconcile/recon_config.py +363 -0
  75. databricks/labs/lakebridge/reconcile/recon_output_config.py +85 -0
  76. databricks/labs/lakebridge/reconcile/runner.py +97 -0
  77. databricks/labs/lakebridge/reconcile/sampler.py +239 -0
  78. databricks/labs/lakebridge/reconcile/schema_compare.py +126 -0
  79. databricks/labs/lakebridge/resources/__init__.py +0 -0
  80. databricks/labs/lakebridge/resources/config/credentials.yml +33 -0
  81. databricks/labs/lakebridge/resources/reconcile/__init__.py +0 -0
  82. databricks/labs/lakebridge/resources/reconcile/dashboards/__init__.py +0 -0
  83. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/00_0_aggregate_recon_header.md +6 -0
  84. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_0_recon_id.filter.yml +6 -0
  85. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_1_executed_by.filter.yml +5 -0
  86. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_2_started_at.filter.yml +5 -0
  87. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_0_source_type.filter.yml +5 -0
  88. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_1_source_table.filter.yml +5 -0
  89. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_2_target_table.filter.yml +5 -0
  90. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/04_0_aggregate_summary_table.sql +46 -0
  91. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/05_0_aggregate_recon_drilldown_header.md +2 -0
  92. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_0_recon_id.filter.yml +5 -0
  93. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_1_category.filter.yml +5 -0
  94. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_2_aggregate_type.filter.yml +5 -0
  95. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/07_0_target_table.filter.yml +4 -0
  96. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/07_1_source_table.filter.yml +4 -0
  97. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/08_0_aggregate_details_table.sql +92 -0
  98. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/09_0_aggregate_missing_mismatch_header.md +1 -0
  99. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/10_0_aggr_mismatched_records.sql +19 -0
  100. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/11_0_aggr_missing_in_databricks.sql +19 -0
  101. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/11_1_aggr_missing_in_source.sql +19 -0
  102. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/dashboard.yml +365 -0
  103. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/00_0_recon_main.md +3 -0
  104. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/01_0_recon_id.filter.yml +6 -0
  105. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/01_1_report_type.filter.yml +5 -0
  106. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/01_2_executed_by.filter.yml +5 -0
  107. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/02_0_source_type.filter.yml +5 -0
  108. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/02_1_source_table.filter.yml +6 -0
  109. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/02_2_target_table.filter.yml +6 -0
  110. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/03_0_started_at.filter.yml +5 -0
  111. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/05_0_summary_table.sql +38 -0
  112. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/06_0_schema_comparison_header.md +3 -0
  113. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/07_0_schema_details_table.sql +42 -0
  114. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/08_0_drill_down_header.md +3 -0
  115. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/09_0_recon_id.filter.yml +4 -0
  116. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/09_1_category.filter.yml +4 -0
  117. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/10_0_target_table.filter.yml +4 -0
  118. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/10_1_source_table.filter.yml +4 -0
  119. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/11_0_recon_details_pivot.sql +40 -0
  120. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/12_0_daily_data_validation_issue_header.md +3 -0
  121. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/13_0_success_fail_.filter.yml +4 -0
  122. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/14_0_failed_recon_ids.sql +15 -0
  123. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/15_0_total_failed_runs.sql +10 -0
  124. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/15_1_failed_targets.sql +10 -0
  125. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/15_2_successful_targets.sql +10 -0
  126. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/16_0_missing_mismatch_header.md +1 -0
  127. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/17_0_mismatched_records.sql +14 -0
  128. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/17_1_threshold_mismatches.sql +14 -0
  129. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/18_0_missing_in_databricks.sql +14 -0
  130. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/18_1_missing_in_source.sql +14 -0
  131. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/dashboard.yml +545 -0
  132. databricks/labs/lakebridge/resources/reconcile/queries/__init__.py +0 -0
  133. databricks/labs/lakebridge/resources/reconcile/queries/installation/__init__.py +0 -0
  134. databricks/labs/lakebridge/resources/reconcile/queries/installation/aggregate_details.sql +7 -0
  135. databricks/labs/lakebridge/resources/reconcile/queries/installation/aggregate_metrics.sql +15 -0
  136. databricks/labs/lakebridge/resources/reconcile/queries/installation/aggregate_rules.sql +6 -0
  137. databricks/labs/lakebridge/resources/reconcile/queries/installation/details.sql +7 -0
  138. databricks/labs/lakebridge/resources/reconcile/queries/installation/main.sql +24 -0
  139. databricks/labs/lakebridge/resources/reconcile/queries/installation/metrics.sql +21 -0
  140. databricks/labs/lakebridge/transpiler/__init__.py +0 -0
  141. databricks/labs/lakebridge/transpiler/execute.py +423 -0
  142. databricks/labs/lakebridge/transpiler/lsp/__init__.py +0 -0
  143. databricks/labs/lakebridge/transpiler/lsp/lsp_engine.py +564 -0
  144. databricks/labs/lakebridge/transpiler/sqlglot/__init__.py +0 -0
  145. databricks/labs/lakebridge/transpiler/sqlglot/dialect_utils.py +30 -0
  146. databricks/labs/lakebridge/transpiler/sqlglot/generator/__init__.py +0 -0
  147. databricks/labs/lakebridge/transpiler/sqlglot/generator/databricks.py +771 -0
  148. databricks/labs/lakebridge/transpiler/sqlglot/lca_utils.py +138 -0
  149. databricks/labs/lakebridge/transpiler/sqlglot/local_expression.py +197 -0
  150. databricks/labs/lakebridge/transpiler/sqlglot/parsers/__init__.py +0 -0
  151. databricks/labs/lakebridge/transpiler/sqlglot/parsers/oracle.py +23 -0
  152. databricks/labs/lakebridge/transpiler/sqlglot/parsers/presto.py +202 -0
  153. databricks/labs/lakebridge/transpiler/sqlglot/parsers/snowflake.py +535 -0
  154. databricks/labs/lakebridge/transpiler/sqlglot/sqlglot_engine.py +203 -0
  155. databricks/labs/lakebridge/transpiler/transpile_engine.py +49 -0
  156. databricks/labs/lakebridge/transpiler/transpile_status.py +68 -0
  157. databricks/labs/lakebridge/uninstall.py +28 -0
  158. databricks/labs/lakebridge/upgrades/v0.4.0_add_main_table_operation_name_column.py +80 -0
  159. databricks/labs/lakebridge/upgrades/v0.6.0_alter_metrics_datatype.py +51 -0
  160. databricks_labs_lakebridge-0.10.0.dist-info/METADATA +58 -0
  161. databricks_labs_lakebridge-0.10.0.dist-info/RECORD +171 -0
  162. databricks_labs_lakebridge-0.10.0.dist-info/WHEEL +4 -0
  163. databricks_labs_lakebridge-0.10.0.dist-info/entry_points.txt +2 -0
  164. databricks_labs_lakebridge-0.10.0.dist-info/licenses/LICENSE +69 -0
  165. databricks_labs_lakebridge-0.10.0.dist-info/licenses/NOTICE +42 -0
  166. docs/lakebridge/src/components/Button.tsx +81 -0
  167. docs/lakebridge/src/css/custom.css +167 -0
  168. docs/lakebridge/src/css/table.css +20 -0
  169. docs/lakebridge/src/pages/index.tsx +57 -0
  170. docs/lakebridge/src/theme/Footer/index.tsx +24 -0
  171. docs/lakebridge/src/theme/Layout/index.tsx +18 -0
databricks/labs/lakebridge/transpiler/sqlglot/sqlglot_engine.py
@@ -0,0 +1,203 @@
+import logging
+import typing as t
+from collections.abc import Iterable
+from dataclasses import dataclass
+from pathlib import Path
+
+from sqlglot import expressions as exp, parse, transpile, Dialect
+from sqlglot.errors import ErrorLevel, ParseError, TokenError, UnsupportedError
+from sqlglot.expressions import Expression
+from sqlglot.tokens import Token, TokenType
+
+from databricks.labs.lakebridge.config import TranspileResult, TranspileConfig
+from databricks.labs.lakebridge.helpers.file_utils import is_sql_file
+from databricks.labs.lakebridge.helpers.string_utils import format_error_message
+from databricks.labs.lakebridge.transpiler.sqlglot import lca_utils
+from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import get_dialect
+from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import SQLGLOT_DIALECTS
+from databricks.labs.lakebridge.transpiler.transpile_status import TranspileError, ErrorKind, ErrorSeverity
+from databricks.labs.lakebridge.transpiler.transpile_engine import TranspileEngine
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class ParsedExpression:
+    original_sql: str
+    parsed_expression: Expression
+
+
+@dataclass
+class ParserProblem:
+    original_sql: str
+    transpile_error: TranspileError
+
+
+class SqlglotEngine(TranspileEngine):
+
+    @property
+    def supported_dialects(self) -> list[str]:
+        return sorted(SQLGLOT_DIALECTS.keys())
+
+    def _partial_transpile(
+        self,
+        read_dialect: Dialect,
+        write_dialect: Dialect,
+        source_code: str,
+        file_path: Path,
+    ) -> tuple[list[str], list[ParserProblem]]:
+        transpiled_sqls: list[str] = []
+        parsed_expressions, problem_list = self.safe_parse(read_dialect, source_code, file_path)
+        for parsed_expression in parsed_expressions:
+            try:
+                transpiled_sql = write_dialect.generate(parsed_expression.parsed_expression, pretty=True)
+                # If the generated SQL is nothing but a comment, treat the statement as unsupported
+                if transpiled_sql.startswith("--"):
+                    raise UnsupportedError("Unsupported SQL")
+                transpiled_sqls.append(transpiled_sql)
+            except TokenError as e:
+                error_msg = format_error_message("Token Error", e, parsed_expression.original_sql)
+                error = TranspileError("TOKEN_ERROR", ErrorKind.PARSING, ErrorSeverity.ERROR, file_path, error_msg)
+                problem_list.append(ParserProblem(parsed_expression.original_sql, error))
+            except ParseError as e:
+                error_msg = format_error_message("Parsing Error", e, parsed_expression.original_sql)
+                error = TranspileError("PARSE_ERROR", ErrorKind.PARSING, ErrorSeverity.ERROR, file_path, error_msg)
+                problem_list.append(ParserProblem(parsed_expression.original_sql, error))
+            except UnsupportedError as e:
+                error_msg = format_error_message("Unsupported SQL Error", e, parsed_expression.original_sql)
+                error = TranspileError("UNSUPPORTED_SQL", ErrorKind.PARSING, ErrorSeverity.ERROR, file_path, error_msg)
+                problem_list.append(ParserProblem(parsed_expression.original_sql, error))
+        return transpiled_sqls, problem_list
+
+    async def initialize(self, config: TranspileConfig) -> None:
+        pass
+
+    async def shutdown(self) -> None:
+        pass
+
+    async def transpile(
+        self, source_dialect: str, target_dialect: str, source_code: str, file_path: Path
+    ) -> TranspileResult:
+        read_dialect = get_dialect(source_dialect)
+        error: TranspileError | None = self._check_supported(read_dialect, source_code, file_path)
+        if error:
+            return TranspileResult(source_code, 1, [error])
+        write_dialect = get_dialect(target_dialect)
+        try:
+            transpiled_expressions = transpile(
+                source_code, read=read_dialect, write=write_dialect, pretty=True, error_level=ErrorLevel.RAISE
+            )
+            transpiled_code = "\n".join(transpiled_expressions)
+            return TranspileResult(transpiled_code, len(transpiled_expressions), [])
+        except (ParseError, TokenError, UnsupportedError) as e:
+            logger.error(f"Exception caught for file {file_path!s}: {e}")
+            transpiled_expressions, problems = self._partial_transpile(
+                read_dialect, write_dialect, source_code, file_path
+            )
+            transpiled_code = "\n".join(transpiled_expressions)
+            return TranspileResult(transpiled_code, 1, [problem.transpile_error for problem in problems])
+
+    def parse(
+        self, source_dialect: str, source_sql: str, file_path: Path
+    ) -> tuple[list[Expression | None] | None, TranspileError | None]:
+        expression = None
+        error = None
+        try:
+            expression = parse(source_sql, read=source_dialect, error_level=ErrorLevel.IMMEDIATE)
+        except TokenError as e:
+            error_msg = format_error_message("Token Error", e, source_sql)
+            error = TranspileError("TOKEN_ERROR", ErrorKind.PARSING, ErrorSeverity.ERROR, file_path, error_msg)
+        except ParseError as e:
+            error_msg = format_error_message("Parsing Error", e, source_sql)
+            error = TranspileError("PARSE_ERROR", ErrorKind.PARSING, ErrorSeverity.ERROR, file_path, error_msg)
+        except UnsupportedError as e:
+            error_msg = format_error_message("Unsupported SQL Error", e, source_sql)
+            error = TranspileError("UNSUPPORTED_SQL", ErrorKind.PARSING, ErrorSeverity.ERROR, file_path, error_msg)
+        return expression, error
+
+    def analyse_table_lineage(
+        self, source_dialect: str, source_code: str, file_path: Path
+    ) -> Iterable[tuple[str, str]]:
+        parsed_expression, _ = self.parse(source_dialect, source_code, file_path)
+        if parsed_expression is not None:
+            for expr in parsed_expression:
+                child: str = str(file_path)
+                if expr is not None:
+                    # TODO: fix possible issue where the file reference is lost (if we have a 'create')
+                    for change in expr.find_all(exp.Create, exp.Insert, exp.Merge, bfs=False):
+                        child = self._find_root_table(change)
+
+                    for query in expr.find_all(exp.Select, exp.Join, exp.With, bfs=False):
+                        table = self._find_root_table(query)
+                        if table:
+                            yield table, child
+
+    def safe_parse(
+        self, read_dialect: Dialect, source_code: str, file_path: Path
+    ) -> tuple[list[ParsedExpression], list[ParserProblem]]:
+        try:
+            tokens = read_dialect.tokenize(sql=source_code)
+            return self._safe_parse(read_dialect, tokens, file_path)
+        except TokenError as e:
+            error_msg = format_error_message("Token error", e, source_code)
+            error = TranspileError("TOKEN_ERROR", ErrorKind.PARSING, ErrorSeverity.ERROR, file_path, error_msg)
+            return [], [ParserProblem(source_code, error)]
+
+    def _safe_parse(
+        self, read_dialect: Dialect, all_tokens: list[Token], file_path: Path
+    ) -> tuple[list[ParsedExpression], list[ParserProblem]]:
+        chunks = self._make_chunks(all_tokens)
+        parsed_expressions: list[ParsedExpression] = []
+        problems: list[ParserProblem] = []
+        parser_opts = {"error_level": ErrorLevel.RAISE}
+        parser = read_dialect.parser(**parser_opts)
+        for sql, tokens in chunks:
+            try:
+                expressions = parser.parse(tokens)
+                expression = t.cast(Expression, expressions[0])
+                parsed_expressions.append(ParsedExpression(sql, expression))
+            except TokenError as e:
+                error_msg = format_error_message("Token error", e, sql)
+                error = TranspileError("TOKEN_ERROR", ErrorKind.PARSING, ErrorSeverity.ERROR, file_path, error_msg)
+                problems.append(ParserProblem(sql, error))
+            except ParseError as e:
+                error_msg = format_error_message("Parsing error", e, sql)
+                error = TranspileError("PARSE_ERROR", ErrorKind.PARSING, ErrorSeverity.ERROR, file_path, error_msg)
+                problems.append(ParserProblem(sql, error))
+            except UnsupportedError as e:
+                error_msg = format_error_message("Unsupported SQL error", e, sql)
+                error = TranspileError("UNSUPPORTED_SQL", ErrorKind.PARSING, ErrorSeverity.ERROR, file_path, error_msg)
+                problems.append(ParserProblem(sql, error))
+            finally:
+                parser.reset()
+        return parsed_expressions, problems
+
+    @staticmethod
+    def _make_chunks(tokens: list[Token]) -> list[tuple[str, list[Token]]]:
+        chunks: list[tuple[str, list[Token]]] = []
+        current_chunk: list[Token] = []
+        # Split tokens into chunks on semicolons (or other separators);
+        # the separator must be defined in the dialect's Tokenizer class.
+        for token in tokens:
+            current_chunk.append(token)
+            if token.token_type in {TokenType.SEMICOLON}:
+                original_sql = " ".join([token.text for token in current_chunk]).strip()
+                chunks.append((original_sql, current_chunk))
+                # reset for the next statement
+                current_chunk = []
+        # don't forget the last chunk
+        if current_chunk:
+            original_sql = " ".join([token.text for token in current_chunk]).strip()
+            chunks.append((original_sql, current_chunk))
+        return chunks
+
+    @staticmethod
+    def _find_root_table(expression) -> str:
+        table = expression.find(exp.Table, bfs=False)
+        return table.name if table else ""
+
+    def _check_supported(self, source_dialect: Dialect, source_code: str, file_path: Path) -> TranspileError | None:
+        return lca_utils.check_for_unsupported_lca(source_dialect, source_code, file_path)
+
+    def is_supported_file(self, file: Path) -> bool:
+        return is_sql_file(file)
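For orientation, a minimal usage sketch of the engine above. This is an assumption-laden illustration, not part of the package: `"snowflake"` and `"databricks"` are assumed to be registered keys in `SQLGLOT_DIALECTS`, and the result is printed as an opaque `TranspileResult` because only its positional construction is visible in this diff.

```python
import asyncio
from pathlib import Path

from databricks.labs.lakebridge.transpiler.sqlglot.sqlglot_engine import SqlglotEngine

engine = SqlglotEngine()
print(engine.supported_dialects)  # sorted keys of SQLGLOT_DIALECTS

# transpile() is a coroutine, so drive it with asyncio.run();
# the dialect names and file path here are hypothetical.
result = asyncio.run(
    engine.transpile("snowflake", "databricks", "SELECT 1;", Path("example.sql"))
)
print(result)  # TranspileResult built positionally: (code, statement count, errors)
```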
databricks/labs/lakebridge/transpiler/transpile_engine.py
@@ -0,0 +1,49 @@
+from __future__ import annotations
+import abc
+from pathlib import Path
+
+from databricks.labs.lakebridge.config import TranspileResult, TranspileConfig
+
+
+class TranspileEngine(abc.ABC):
+
+    @classmethod
+    def load_engine(cls, transpiler_config_path: Path) -> TranspileEngine:
+        # TODO remove this once sqlglot transpiler is pluggable
+        if str(transpiler_config_path) == "sqlglot":
+            # pylint: disable=import-outside-toplevel, cyclic-import
+            from databricks.labs.lakebridge.transpiler.sqlglot.sqlglot_engine import SqlglotEngine
+
+            return SqlglotEngine()
+        if not transpiler_config_path.exists():
+            raise ValueError(
+                f"Error: Invalid value for '--transpiler-config-path': '{str(transpiler_config_path)}', file does not exist."
+            )
+        # pylint: disable=import-outside-toplevel, cyclic-import
+        from databricks.labs.lakebridge.transpiler.lsp.lsp_engine import LSPEngine
+
+        return LSPEngine.from_config_path(transpiler_config_path)
+
+    @abc.abstractmethod
+    async def initialize(self, config: TranspileConfig) -> None: ...
+
+    @abc.abstractmethod
+    async def shutdown(self) -> None: ...
+
+    @abc.abstractmethod
+    async def transpile(
+        self, source_dialect: str, target_dialect: str, source_code: str, file_path: Path
+    ) -> TranspileResult: ...
+
+    @property
+    @abc.abstractmethod
+    def supported_dialects(self) -> list[str]: ...
+
+    def check_source_dialect(self, source_dialect: str | None) -> None:
+        if source_dialect not in self.supported_dialects:
+            raise ValueError(
+                f"Invalid value for '--source-dialect': '{source_dialect}' is not one of {self.supported_dialects}."
+            )
+
+    @abc.abstractmethod
+    def is_supported_file(self, file: Path) -> bool: ...
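A small sketch of the `load_engine` dispatch above, under the assumption that the special literal `sqlglot` keeps working as shown; any other value is treated as an LSP transpiler config file and must exist on disk. The dialect name is a hypothetical supported key.

```python
from pathlib import Path

from databricks.labs.lakebridge.transpiler.transpile_engine import TranspileEngine

# The literal path "sqlglot" short-circuits to the built-in SqlglotEngine;
# any other path is loaded via LSPEngine.from_config_path().
engine = TranspileEngine.load_engine(Path("sqlglot"))

# check_source_dialect() raises ValueError for dialects outside supported_dialects.
engine.check_source_dialect("snowflake")  # assumed to be a supported dialect key
```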
databricks/labs/lakebridge/transpiler/transpile_status.py
@@ -0,0 +1,68 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from enum import Enum
+from pathlib import Path
+
+
+# not using StrEnum because it only appeared in Python 3.11
+class ErrorSeverity(Enum):
+    INFO = "INFO"
+    WARNING = "WARNING"
+    ERROR = "ERROR"
+
+
+class ErrorKind(Enum):
+    ANALYSIS = "ANALYSIS"
+    PARSING = "PARSING"
+    GENERATION = "GENERATION"
+    VALIDATION = "VALIDATION"
+    INTERNAL = "INTERNAL"
+
+
+@dataclass
+class CodePosition:
+    line: int  # 0-based line number
+    character: int  # 0-based character number
+
+
+@dataclass
+class CodeRange:
+    start: CodePosition
+    end: CodePosition
+
+
+@dataclass
+class TranspileError:
+    code: str
+    kind: ErrorKind
+    severity: ErrorSeverity
+    path: Path
+    message: str
+    range: CodeRange | None = None
+
+    def __str__(self):
+        return f"{type(self).__name__}(code={self.code}, kind={self.kind.name}, severity={self.severity.name}, path='{self.path!s}', message='{self.message}')"
+
+
+@dataclass
+class TranspileStatus:
+    file_list: list[Path]
+    no_of_transpiled_queries: int
+    error_list: list[TranspileError]
+
+    @property
+    def analysis_error_count(self) -> int:
+        return len([error for error in self.error_list if error.kind == ErrorKind.ANALYSIS])
+
+    @property
+    def parsing_error_count(self) -> int:
+        return len([error for error in self.error_list if error.kind == ErrorKind.PARSING])
+
+    @property
+    def generation_error_count(self) -> int:
+        return len([error for error in self.error_list if error.kind == ErrorKind.GENERATION])
+
+    @property
+    def validation_error_count(self) -> int:
+        return len([error for error in self.error_list if error.kind == ErrorKind.VALIDATION])
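A brief sketch of how these dataclasses compose; the error code matches the constants used in the engine above, while the file path and message strings are illustrative only.

```python
from pathlib import Path

from databricks.labs.lakebridge.transpiler.transpile_status import (
    ErrorKind,
    ErrorSeverity,
    TranspileError,
    TranspileStatus,
)

# One parsing error recorded against a hypothetical input file.
error = TranspileError(
    "PARSE_ERROR", ErrorKind.PARSING, ErrorSeverity.ERROR, Path("example.sql"), "unexpected token"
)
status = TranspileStatus(file_list=[Path("example.sql")], no_of_transpiled_queries=0, error_list=[error])

print(status.parsing_error_count)   # 1
print(status.analysis_error_count)  # 0
print(error)  # __str__ renders code, kind, severity, path, and message
```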
databricks/labs/lakebridge/uninstall.py
@@ -0,0 +1,28 @@
+import logging
+
+from databricks.labs.blueprint.entrypoint import is_in_debug
+from databricks.sdk import WorkspaceClient
+
+from databricks.labs.lakebridge.__about__ import __version__
+from databricks.labs.lakebridge.contexts.application import ApplicationContext
+
+logger = logging.getLogger("databricks.labs.lakebridge.install")
+
+
+def run(context: ApplicationContext):
+    context.workspace_installation.uninstall(context.remorph_config)
+
+
+if __name__ == "__main__":
+    logger.setLevel("INFO")
+    if is_in_debug():
+        logging.getLogger("databricks").setLevel(logging.DEBUG)
+
+    run(
+        ApplicationContext(
+            WorkspaceClient(
+                product="remorph",
+                product_version=__version__,
+            )
+        )
+    )
databricks/labs/lakebridge/upgrades/v0.4.0_add_main_table_operation_name_column.py
@@ -0,0 +1,80 @@
+# pylint: disable=invalid-name
+import logging
+
+
+from databricks.labs.blueprint.installation import Installation
+from databricks.sdk import WorkspaceClient
+
+from databricks.labs.lakebridge.contexts.application import ApplicationContext
+from databricks.labs.lakebridge.deployment.recon import RECON_JOB_NAME
+from databricks.labs.lakebridge.helpers import db_sql
+
+from databricks.labs.lakebridge.deployment.upgrade_common import (
+    current_table_columns,
+    installed_table_columns,
+    recreate_table_sql,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def _check_table_mismatch(
+    installed_table,
+    current_table,
+) -> bool:
+    current_table = [x for x in current_table if x != "operation_name"]
+    # Compare the current main table columns with the installed main table columns
+    if "operation_name" in installed_table and len(sorted(installed_table)) != len(sorted(current_table)):
+        return True
+    return False
+
+
+def _upgrade_reconcile_metadata_main_table(
+    installation: Installation,
+    ws: WorkspaceClient,
+    app_context: ApplicationContext,
+):
+    """
+    Add the operation_name column to the main table as part of the upgrade process.
+    - Compare the current main table columns with the installed main table columns. If there is any mismatch:
+      * Verify all the current main table columns are present in the installed main table, then use CTAS to recreate the main table
+      * If any of the current main table columns are missing in the installed main table, prompt the user to recreate the main table:
+        - If the user confirms, recreate the main table using the main DDL file; otherwise log an error message and exit
+    :param installation:
+    :param ws:
+    :param app_context:
+    """
+    reconcile_config = app_context.recon_config
+    assert reconcile_config, "Reconcile config must be present to upgrade the reconcile metadata main table"
+    table_name = "main"
+    table_identifier = (
+        f"{reconcile_config.metadata_config.catalog}.{reconcile_config.metadata_config.schema}.{table_name}"
+    )
+    installed_columns = installed_table_columns(ws, table_identifier)
+    current_columns = current_table_columns(table_name, table_identifier)
+    sql: str | None = f"ALTER TABLE {table_identifier} ADD COLUMN operation_name STRING AFTER report_type"
+    if _check_table_mismatch(installed_columns, current_columns):
+        logger.info("Recreating main table")
+        sql = recreate_table_sql(table_identifier, installed_columns, current_columns, app_context.prompts)
+    if sql:
+        logger.debug(f"Executing SQL to upgrade main table: \n{sql}")
+        db_sql.get_sql_backend(ws).execute(sql)
+        installation.save(reconcile_config)
+        logger.debug("Upgraded Reconcile main table")
+
+
+def _upgrade_reconcile_workflow(app_context: ApplicationContext):
+    if app_context.recon_config:
+        logger.info("Upgrading reconcile workflow")
+        wheels = app_context.product_info.wheels(app_context.workspace_client)
+        with wheels as wheel_builder:
+            wheel_path = f"/Workspace{wheel_builder.upload_to_wsfs()}"
+            app_context.job_deployment.deploy_recon_job(RECON_JOB_NAME, app_context.recon_config, wheel_path)
+        logger.debug("Upgraded reconcile workflow")
+
+
+def upgrade(installation: Installation, ws: WorkspaceClient):
+    app_context = ApplicationContext(ws)
+    if app_context.recon_config is not None:
+        _upgrade_reconcile_metadata_main_table(installation, ws, app_context)
+        _upgrade_reconcile_workflow(app_context)
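To make the mismatch check above concrete, a worked illustration with hypothetical column lists (not the real schema): the function only reports a mismatch when the installed table already has `operation_name` and the column counts differ after `operation_name` is dropped from the current list.

```python
# Hypothetical column lists, mirroring _check_table_mismatch() above.
installed = ["recon_id", "report_type", "start_ts", "operation_name"]  # 4 columns
current = ["recon_id", "report_type", "operation_name"]                # 3 columns

filtered_current = [c for c in current if c != "operation_name"]       # 2 columns
mismatch = "operation_name" in installed and len(installed) != len(filtered_current)
print(mismatch)  # True -> the main table is recreated instead of ALTERed
```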
databricks/labs/lakebridge/upgrades/v0.6.0_alter_metrics_datatype.py
@@ -0,0 +1,51 @@
+# pylint: disable=invalid-name
+import logging
+
+from databricks.labs.blueprint.installation import Installation
+from databricks.sdk import WorkspaceClient
+
+from databricks.labs.lakebridge.contexts.application import ApplicationContext
+from databricks.labs.lakebridge.deployment.upgrade_common import (
+    current_table_columns,
+    installed_table_columns,
+    check_table_mismatch,
+    recreate_table_sql,
+)
+from databricks.labs.lakebridge.helpers import db_sql
+
+logger = logging.getLogger(__name__)
+
+
+def _upgrade_reconcile_metadata_metrics_table(
+    installation: Installation, ws: WorkspaceClient, app_context: ApplicationContext
+):
+    reconcile_config = app_context.recon_config
+    assert reconcile_config, "Reconcile config must be present to upgrade the reconcile metadata metrics table"
+    table_name = "metrics"
+    table_identifier = (
+        f"{reconcile_config.metadata_config.catalog}.{reconcile_config.metadata_config.schema}.{table_name}"
+    )
+    installed_columns = installed_table_columns(ws, table_identifier)
+    current_columns = current_table_columns(table_name, table_identifier)
+    sqls: list | None = [
+        f"ALTER TABLE {table_identifier} SET TBLPROPERTIES ('delta.enableTypeWidening' = 'true')",
+        f"ALTER TABLE {table_identifier} ALTER COLUMN recon_metrics.row_comparison.missing_in_source TYPE BIGINT",
+        f"ALTER TABLE {table_identifier} ALTER COLUMN recon_metrics.row_comparison.missing_in_target TYPE BIGINT",
+        f"ALTER TABLE {table_identifier} ALTER COLUMN recon_metrics.column_comparison.absolute_mismatch TYPE BIGINT",
+        f"ALTER TABLE {table_identifier} ALTER COLUMN recon_metrics.column_comparison.threshold_mismatch TYPE BIGINT",
+    ]
+    if check_table_mismatch(installed_columns, current_columns):
+        logger.info("Recreating metrics table")
+        sqls = [recreate_table_sql(table_identifier, installed_columns, current_columns, app_context.prompts)]
+    if sqls:
+        for sql in sqls:
+            logger.debug(f"Executing SQL to upgrade metrics table: \n{sql}")
+            db_sql.get_sql_backend(ws).execute(sql)
+        installation.save(reconcile_config)
+        logger.debug("Upgraded Reconcile metrics table")
+
+
+def upgrade(installation: Installation, ws: WorkspaceClient):
+    app_context = ApplicationContext(ws)
+    if app_context.recon_config is not None:
+        _upgrade_reconcile_metadata_metrics_table(installation, ws, app_context)
databricks_labs_lakebridge-0.10.0.dist-info/METADATA
@@ -0,0 +1,58 @@
+Metadata-Version: 2.4
+Name: databricks-labs-lakebridge
+Version: 0.10.0
+Summary: Fast and predictable migrations to Databricks Lakehouse Platform. This tool is designed to help you migrate your data and workloads to the Databricks Lakehouse Platform in a fast, predictable, and reliable way. It provides a set of tools and utilities to help you reconcile your data and workloads, assess your current state, and plan your migration.
+Project-URL: Documentation, https://github.com/databrickslabs/lakebridge
+Project-URL: Issues, https://github.com/databrickslabs/lakebridge/issues
+Project-URL: Source, https://github.com/databrickslabs/lakebridge
+Maintainer-email: Databricks Labs <labs-oss@databricks.com>
+License-File: LICENSE
+License-File: NOTICE
+Keywords: Databricks
+Classifier: Development Status :: 4 - Beta
+Classifier: Environment :: Console
+Classifier: Framework :: Pytest
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: System Administrators
+Classifier: License :: Other/Proprietary License
+Classifier: Operating System :: MacOS
+Classifier: Operating System :: Microsoft :: Windows
+Classifier: Programming Language :: Python
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: Implementation :: CPython
+Classifier: Topic :: Software Development :: Libraries
+Classifier: Topic :: Utilities
+Requires-Python: >=3.10
+Requires-Dist: cryptography<45.1.0,>=44.0.2
+Requires-Dist: databricks-bb-analyzer~=0.1.6
+Requires-Dist: databricks-labs-blueprint[yaml]<0.12.0,>=0.11.0
+Requires-Dist: databricks-labs-lsql==0.16.0
+Requires-Dist: databricks-sdk~=0.51.0
+Requires-Dist: duckdb~=1.2.2
+Requires-Dist: pygls~=2.0.0a2
+Requires-Dist: pyodbc~=5.2.0
+Requires-Dist: sqlalchemy~=2.0.40
+Requires-Dist: sqlglot==26.1.3
+Requires-Dist: standard-distutils~=3.11.9; python_version >= '3.11'
+Description-Content-Type: text/markdown
+
+Databricks Labs Lakebridge
+---
+![Databricks Labs Lakebridge White](/docs/lakebridge/static/img/lakebridge-lockup-white-background.svg)
+
+
+[![build](https://github.com/databrickslabs/remorph/actions/workflows/push.yml/badge.svg)](https://github.com/databrickslabs/remorph/actions/workflows/push.yml)
+![PyPI - Downloads](https://img.shields.io/pypi/dm/databricks-labs-remorph?cacheSeconds=3600)
+
+-----
+Documentation
+The complete documentation is available at: https://databrickslabs.github.io/lakebridge/
+
+Contribution
+Please see the contribution guidance here on how to contribute to the project (build, test, and submit a PR).
+
+Project Support
+Please note that this project is provided for your exploration only and is not formally supported by Databricks with Service Level Agreements (SLAs). It is provided AS-IS, and we do not make any guarantees. Please do not submit a support ticket relating to any issues arising from the use of this project.
+
+Any issues discovered through the use of this project should be filed as GitHub Issues on this repository. They will be reviewed as time permits, but no formal SLAs for support exist.