databricks-labs-lakebridge 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (171) hide show
  1. databricks/__init__.py +3 -0
  2. databricks/labs/__init__.py +3 -0
  3. databricks/labs/lakebridge/__about__.py +2 -0
  4. databricks/labs/lakebridge/__init__.py +11 -0
  5. databricks/labs/lakebridge/assessments/configure_assessment.py +194 -0
  6. databricks/labs/lakebridge/assessments/pipeline.py +188 -0
  7. databricks/labs/lakebridge/assessments/profiler_config.py +30 -0
  8. databricks/labs/lakebridge/base_install.py +12 -0
  9. databricks/labs/lakebridge/cli.py +449 -0
  10. databricks/labs/lakebridge/config.py +192 -0
  11. databricks/labs/lakebridge/connections/__init__.py +0 -0
  12. databricks/labs/lakebridge/connections/credential_manager.py +89 -0
  13. databricks/labs/lakebridge/connections/database_manager.py +98 -0
  14. databricks/labs/lakebridge/connections/env_getter.py +13 -0
  15. databricks/labs/lakebridge/contexts/__init__.py +0 -0
  16. databricks/labs/lakebridge/contexts/application.py +133 -0
  17. databricks/labs/lakebridge/coverage/__init__.py +0 -0
  18. databricks/labs/lakebridge/coverage/commons.py +223 -0
  19. databricks/labs/lakebridge/coverage/lakebridge_snow_transpilation_coverage.py +29 -0
  20. databricks/labs/lakebridge/coverage/local_report.py +9 -0
  21. databricks/labs/lakebridge/coverage/sqlglot_snow_transpilation_coverage.py +5 -0
  22. databricks/labs/lakebridge/coverage/sqlglot_tsql_transpilation_coverage.py +5 -0
  23. databricks/labs/lakebridge/deployment/__init__.py +0 -0
  24. databricks/labs/lakebridge/deployment/configurator.py +199 -0
  25. databricks/labs/lakebridge/deployment/dashboard.py +140 -0
  26. databricks/labs/lakebridge/deployment/installation.py +125 -0
  27. databricks/labs/lakebridge/deployment/job.py +147 -0
  28. databricks/labs/lakebridge/deployment/recon.py +145 -0
  29. databricks/labs/lakebridge/deployment/table.py +30 -0
  30. databricks/labs/lakebridge/deployment/upgrade_common.py +124 -0
  31. databricks/labs/lakebridge/discovery/table.py +36 -0
  32. databricks/labs/lakebridge/discovery/table_definition.py +23 -0
  33. databricks/labs/lakebridge/discovery/tsql_table_definition.py +185 -0
  34. databricks/labs/lakebridge/errors/exceptions.py +1 -0
  35. databricks/labs/lakebridge/helpers/__init__.py +0 -0
  36. databricks/labs/lakebridge/helpers/db_sql.py +24 -0
  37. databricks/labs/lakebridge/helpers/execution_time.py +20 -0
  38. databricks/labs/lakebridge/helpers/file_utils.py +64 -0
  39. databricks/labs/lakebridge/helpers/metastore.py +164 -0
  40. databricks/labs/lakebridge/helpers/recon_config_utils.py +176 -0
  41. databricks/labs/lakebridge/helpers/string_utils.py +62 -0
  42. databricks/labs/lakebridge/helpers/telemetry_utils.py +13 -0
  43. databricks/labs/lakebridge/helpers/validation.py +101 -0
  44. databricks/labs/lakebridge/install.py +849 -0
  45. databricks/labs/lakebridge/intermediate/__init__.py +0 -0
  46. databricks/labs/lakebridge/intermediate/dag.py +88 -0
  47. databricks/labs/lakebridge/intermediate/engine_adapter.py +0 -0
  48. databricks/labs/lakebridge/intermediate/root_tables.py +44 -0
  49. databricks/labs/lakebridge/jvmproxy.py +56 -0
  50. databricks/labs/lakebridge/lineage.py +42 -0
  51. databricks/labs/lakebridge/reconcile/__init__.py +0 -0
  52. databricks/labs/lakebridge/reconcile/compare.py +414 -0
  53. databricks/labs/lakebridge/reconcile/connectors/__init__.py +0 -0
  54. databricks/labs/lakebridge/reconcile/connectors/data_source.py +72 -0
  55. databricks/labs/lakebridge/reconcile/connectors/databricks.py +87 -0
  56. databricks/labs/lakebridge/reconcile/connectors/jdbc_reader.py +41 -0
  57. databricks/labs/lakebridge/reconcile/connectors/oracle.py +108 -0
  58. databricks/labs/lakebridge/reconcile/connectors/secrets.py +30 -0
  59. databricks/labs/lakebridge/reconcile/connectors/snowflake.py +173 -0
  60. databricks/labs/lakebridge/reconcile/connectors/source_adapter.py +30 -0
  61. databricks/labs/lakebridge/reconcile/connectors/sql_server.py +132 -0
  62. databricks/labs/lakebridge/reconcile/constants.py +37 -0
  63. databricks/labs/lakebridge/reconcile/exception.py +42 -0
  64. databricks/labs/lakebridge/reconcile/execute.py +920 -0
  65. databricks/labs/lakebridge/reconcile/query_builder/__init__.py +0 -0
  66. databricks/labs/lakebridge/reconcile/query_builder/aggregate_query.py +293 -0
  67. databricks/labs/lakebridge/reconcile/query_builder/base.py +138 -0
  68. databricks/labs/lakebridge/reconcile/query_builder/count_query.py +33 -0
  69. databricks/labs/lakebridge/reconcile/query_builder/expression_generator.py +292 -0
  70. databricks/labs/lakebridge/reconcile/query_builder/hash_query.py +91 -0
  71. databricks/labs/lakebridge/reconcile/query_builder/sampling_query.py +123 -0
  72. databricks/labs/lakebridge/reconcile/query_builder/threshold_query.py +231 -0
  73. databricks/labs/lakebridge/reconcile/recon_capture.py +635 -0
  74. databricks/labs/lakebridge/reconcile/recon_config.py +363 -0
  75. databricks/labs/lakebridge/reconcile/recon_output_config.py +85 -0
  76. databricks/labs/lakebridge/reconcile/runner.py +97 -0
  77. databricks/labs/lakebridge/reconcile/sampler.py +239 -0
  78. databricks/labs/lakebridge/reconcile/schema_compare.py +126 -0
  79. databricks/labs/lakebridge/resources/__init__.py +0 -0
  80. databricks/labs/lakebridge/resources/config/credentials.yml +33 -0
  81. databricks/labs/lakebridge/resources/reconcile/__init__.py +0 -0
  82. databricks/labs/lakebridge/resources/reconcile/dashboards/__init__.py +0 -0
  83. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/00_0_aggregate_recon_header.md +6 -0
  84. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_0_recon_id.filter.yml +6 -0
  85. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_1_executed_by.filter.yml +5 -0
  86. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_2_started_at.filter.yml +5 -0
  87. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_0_source_type.filter.yml +5 -0
  88. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_1_source_table.filter.yml +5 -0
  89. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_2_target_table.filter.yml +5 -0
  90. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/04_0_aggregate_summary_table.sql +46 -0
  91. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/05_0_aggregate_recon_drilldown_header.md +2 -0
  92. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_0_recon_id.filter.yml +5 -0
  93. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_1_category.filter.yml +5 -0
  94. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_2_aggregate_type.filter.yml +5 -0
  95. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/07_0_target_table.filter.yml +4 -0
  96. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/07_1_source_table.filter.yml +4 -0
  97. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/08_0_aggregate_details_table.sql +92 -0
  98. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/09_0_aggregate_missing_mismatch_header.md +1 -0
  99. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/10_0_aggr_mismatched_records.sql +19 -0
  100. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/11_0_aggr_missing_in_databricks.sql +19 -0
  101. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/11_1_aggr_missing_in_source.sql +19 -0
  102. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/dashboard.yml +365 -0
  103. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/00_0_recon_main.md +3 -0
  104. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/01_0_recon_id.filter.yml +6 -0
  105. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/01_1_report_type.filter.yml +5 -0
  106. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/01_2_executed_by.filter.yml +5 -0
  107. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/02_0_source_type.filter.yml +5 -0
  108. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/02_1_source_table.filter.yml +6 -0
  109. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/02_2_target_table.filter.yml +6 -0
  110. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/03_0_started_at.filter.yml +5 -0
  111. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/05_0_summary_table.sql +38 -0
  112. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/06_0_schema_comparison_header.md +3 -0
  113. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/07_0_schema_details_table.sql +42 -0
  114. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/08_0_drill_down_header.md +3 -0
  115. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/09_0_recon_id.filter.yml +4 -0
  116. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/09_1_category.filter.yml +4 -0
  117. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/10_0_target_table.filter.yml +4 -0
  118. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/10_1_source_table.filter.yml +4 -0
  119. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/11_0_recon_details_pivot.sql +40 -0
  120. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/12_0_daily_data_validation_issue_header.md +3 -0
  121. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/13_0_success_fail_.filter.yml +4 -0
  122. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/14_0_failed_recon_ids.sql +15 -0
  123. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/15_0_total_failed_runs.sql +10 -0
  124. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/15_1_failed_targets.sql +10 -0
  125. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/15_2_successful_targets.sql +10 -0
  126. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/16_0_missing_mismatch_header.md +1 -0
  127. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/17_0_mismatched_records.sql +14 -0
  128. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/17_1_threshold_mismatches.sql +14 -0
  129. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/18_0_missing_in_databricks.sql +14 -0
  130. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/18_1_missing_in_source.sql +14 -0
  131. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/dashboard.yml +545 -0
  132. databricks/labs/lakebridge/resources/reconcile/queries/__init__.py +0 -0
  133. databricks/labs/lakebridge/resources/reconcile/queries/installation/__init__.py +0 -0
  134. databricks/labs/lakebridge/resources/reconcile/queries/installation/aggregate_details.sql +7 -0
  135. databricks/labs/lakebridge/resources/reconcile/queries/installation/aggregate_metrics.sql +15 -0
  136. databricks/labs/lakebridge/resources/reconcile/queries/installation/aggregate_rules.sql +6 -0
  137. databricks/labs/lakebridge/resources/reconcile/queries/installation/details.sql +7 -0
  138. databricks/labs/lakebridge/resources/reconcile/queries/installation/main.sql +24 -0
  139. databricks/labs/lakebridge/resources/reconcile/queries/installation/metrics.sql +21 -0
  140. databricks/labs/lakebridge/transpiler/__init__.py +0 -0
  141. databricks/labs/lakebridge/transpiler/execute.py +423 -0
  142. databricks/labs/lakebridge/transpiler/lsp/__init__.py +0 -0
  143. databricks/labs/lakebridge/transpiler/lsp/lsp_engine.py +564 -0
  144. databricks/labs/lakebridge/transpiler/sqlglot/__init__.py +0 -0
  145. databricks/labs/lakebridge/transpiler/sqlglot/dialect_utils.py +30 -0
  146. databricks/labs/lakebridge/transpiler/sqlglot/generator/__init__.py +0 -0
  147. databricks/labs/lakebridge/transpiler/sqlglot/generator/databricks.py +771 -0
  148. databricks/labs/lakebridge/transpiler/sqlglot/lca_utils.py +138 -0
  149. databricks/labs/lakebridge/transpiler/sqlglot/local_expression.py +197 -0
  150. databricks/labs/lakebridge/transpiler/sqlglot/parsers/__init__.py +0 -0
  151. databricks/labs/lakebridge/transpiler/sqlglot/parsers/oracle.py +23 -0
  152. databricks/labs/lakebridge/transpiler/sqlglot/parsers/presto.py +202 -0
  153. databricks/labs/lakebridge/transpiler/sqlglot/parsers/snowflake.py +535 -0
  154. databricks/labs/lakebridge/transpiler/sqlglot/sqlglot_engine.py +203 -0
  155. databricks/labs/lakebridge/transpiler/transpile_engine.py +49 -0
  156. databricks/labs/lakebridge/transpiler/transpile_status.py +68 -0
  157. databricks/labs/lakebridge/uninstall.py +28 -0
  158. databricks/labs/lakebridge/upgrades/v0.4.0_add_main_table_operation_name_column.py +80 -0
  159. databricks/labs/lakebridge/upgrades/v0.6.0_alter_metrics_datatype.py +51 -0
  160. databricks_labs_lakebridge-0.10.0.dist-info/METADATA +58 -0
  161. databricks_labs_lakebridge-0.10.0.dist-info/RECORD +171 -0
  162. databricks_labs_lakebridge-0.10.0.dist-info/WHEEL +4 -0
  163. databricks_labs_lakebridge-0.10.0.dist-info/entry_points.txt +2 -0
  164. databricks_labs_lakebridge-0.10.0.dist-info/licenses/LICENSE +69 -0
  165. databricks_labs_lakebridge-0.10.0.dist-info/licenses/NOTICE +42 -0
  166. docs/lakebridge/src/components/Button.tsx +81 -0
  167. docs/lakebridge/src/css/custom.css +167 -0
  168. docs/lakebridge/src/css/table.css +20 -0
  169. docs/lakebridge/src/pages/index.tsx +57 -0
  170. docs/lakebridge/src/theme/Footer/index.tsx +24 -0
  171. docs/lakebridge/src/theme/Layout/index.tsx +18 -0
@@ -0,0 +1,292 @@
1
+ from collections.abc import Callable
2
+ from functools import partial
3
+
4
+ from pyspark.sql.types import DataType, NumericType
5
+ from sqlglot import Dialect
6
+ from sqlglot import expressions as exp
7
+
8
+ from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import get_dialect
9
+ from databricks.labs.lakebridge.reconcile.recon_config import HashAlgoMapping
10
+
11
+
12
def _apply_func_expr(expr: exp.Expression, expr_func: Callable, **kwargs) -> exp.Expression:
    """Wrap every column reference inside *expr* with *expr_func*.

    Operates on a copy, so the input tree is never mutated. If *expr* is
    itself a bare column (terminal case), the wrapped expression is returned
    directly instead of rewriting a larger tree.

    :param expr: expression tree whose column nodes are to be wrapped
    :param expr_func: sqlglot expression class (or factory) applied to each column
    :param kwargs: extra arguments forwarded to *expr_func*
    """
    is_terminal = isinstance(expr, exp.Column)
    new_expr = expr.copy()
    for node in new_expr.dfs():
        if isinstance(node, exp.Column):
            column_name = node.name
            table_name = node.table
            # Rebuild the column so the wrapper owns a fresh node rather than
            # the node currently being traversed.
            func = expr_func(this=exp.Column(this=column_name, table=table_name), **kwargs)
            if is_terminal:
                return func
            node.replace(func)
    return new_expr
24
+
25
+
26
def concat(expr: list[exp.Expression]) -> exp.Expression:
    """Build a single CONCAT expression over *expr* (sqlglot's ``safe`` variant)."""
    return exp.Concat(expressions=expr, safe=True)
28
+
29
+
30
def sha2(expr: exp.Expression, num_bits: str, is_expr: bool = False) -> exp.Expression:
    """SHA2-hash *expr* as a whole (``is_expr``) or each of its column references."""
    bit_length = exp.Literal(this=num_bits, is_string=False)
    if is_expr:
        return exp.SHA2(this=expr, length=bit_length)
    return _apply_func_expr(expr, exp.SHA2, length=bit_length)
34
+
35
+
36
def md5(expr: exp.Expression, is_expr: bool = False) -> exp.Expression:
    """MD5-hash *expr* directly (``is_expr``) or each of its column references."""
    return exp.MD5(this=expr) if is_expr else _apply_func_expr(expr, exp.MD5)
40
+
41
+
42
def lower(expr: exp.Expression, is_expr: bool = False) -> exp.Expression:
    """Lower-case *expr* directly (``is_expr``) or each of its column references."""
    return exp.Lower(this=expr) if is_expr else _apply_func_expr(expr, exp.Lower)
46
+
47
+
48
def coalesce(expr: exp.Expression, default="0", is_string=False) -> exp.Expression:
    """COALESCE every column of *expr* with the literal *default*."""
    fallback = exp.Literal(this=default, is_string=is_string)
    return _apply_func_expr(expr, exp.Coalesce, expressions=[fallback])
51
+
52
+
53
def trim(expr: exp.Expression) -> exp.Trim | exp.Expression:
    """Wrap every column reference in *expr* with TRIM."""
    return _apply_func_expr(expr, exp.Trim)
55
+
56
+
57
def json_format(expr: exp.Expression, options: dict[str, str] | None = None) -> exp.Expression:
    """Wrap columns of *expr* in JSON_FORMAT, forwarding *options* unchanged."""
    return _apply_func_expr(expr, exp.JSONFormat, options=options)
59
+
60
+
61
def sort_array(expr: exp.Expression, asc=True) -> exp.Expression:
    """SORT_ARRAY over columns of *expr*; *asc* selects the sort direction."""
    direction = exp.Boolean(this=asc)
    return _apply_func_expr(expr, exp.SortArray, asc=direction)
63
+
64
+
65
def to_char(expr: exp.Expression, to_format=None, nls_param=None) -> exp.Expression:
    """TO_CHAR conversion; a truthy *to_format* is attached as a string literal."""
    if not to_format:
        return _apply_func_expr(expr, exp.ToChar)
    fmt_literal = exp.Literal(this=to_format, is_string=True)
    return _apply_func_expr(expr, exp.ToChar, format=fmt_literal, nls_param=nls_param)
71
+
72
+
73
def array_to_string(
    expr: exp.Expression,
    delimiter: str = ",",
    is_string=True,
    null_replacement: str | None = None,
    is_null_replace=True,
) -> exp.Expression:
    """ARRAY_TO_STRING over columns of *expr* using *delimiter*.

    When *null_replacement* is truthy, NULL array elements are replaced with
    that value before joining.
    """
    kwargs: dict = {"expression": [exp.Literal(this=delimiter, is_string=is_string)]}
    if null_replacement:
        kwargs["null"] = exp.Literal(this=null_replacement, is_string=is_null_replace)
    return _apply_func_expr(expr, exp.ArrayToString, **kwargs)
88
+
89
+
90
def array_sort(expr: exp.Expression, asc=True) -> exp.Expression:
    """ARRAY_SORT over columns of *expr*; direction comes from *asc*."""
    direction = exp.Boolean(this=asc)
    return _apply_func_expr(expr, exp.ArraySort, expression=direction)
92
+
93
+
94
def anonymous(expr: exp.Column, func: str, is_expr: bool = False, dialect=None) -> exp.Expression:
    """Render a SQL function that has no dedicated sqlglot expression class.

    *func* is a format string with a ``{}`` placeholder, e.g.
    ``"unix_timestamp({})"``. Each column reference found in *expr* is
    substituted into the placeholder, and the result is wrapped in an
    ``exp.Column`` so sqlglot emits the text verbatim.

    Example::

        expr = parse_one("select col1 from dual")
        anonymous(expr, "unix_timestamp({})")
        # renders as: SELECT UNIX_TIMESTAMP(col1) FROM DUAL

    When ``is_expr`` is true, *expr* is rendered whole (with *dialect* if
    given) and substituted as-is instead of per column.
    """
    if is_expr:
        if dialect:
            return exp.Column(this=func.format(expr.sql(dialect=dialect)))
        return exp.Column(this=func.format(expr))
    is_terminal = isinstance(expr, exp.Column)
    new_expr = expr.copy()
    for node in new_expr.dfs():
        if isinstance(node, exp.Column):
            # Preserve the table qualifier when present.
            name = f"{node.table}.{node.name}" if node.table else node.name
            anonymous_func = exp.Column(this=func.format(name))
            if is_terminal:
                return anonymous_func
            node.replace(anonymous_func)
    return new_expr
126
+
127
+
128
def build_column(this: exp.ExpOrStr, table_name="", quoted=False, alias=None) -> exp.Expression:
    """Build a column expression, optionally table-qualified and/or aliased."""
    if not alias:
        return exp.Column(this=exp.Identifier(this=this, quoted=quoted), table=table_name)
    alias_identifier = exp.Identifier(this=alias, quoted=quoted)
    if isinstance(this, str):
        return exp.Alias(this=exp.Column(this=this, table=table_name), alias=alias_identifier)
    # An existing expression is aliased as-is.
    return exp.Alias(this=this, alias=alias_identifier)
136
+
137
+
138
def build_literal(this: exp.ExpOrStr, alias=None, quoted=False, is_string=True, cast=None) -> exp.Expression:
    """Build a literal, optionally wrapped in CAST(... AS *cast*) and/or aliased."""
    literal: exp.Expression = exp.Literal(this=this, is_string=is_string)
    if cast:
        literal = exp.Cast(this=literal, to=exp.DataType(this=cast))
    if alias:
        literal = exp.Alias(this=literal, alias=exp.Identifier(this=alias, quoted=quoted))
    return literal
145
+
146
+
147
+ def transform_expression(
148
+ expr: exp.Expression,
149
+ funcs: list[Callable[[exp.Expression], exp.Expression]],
150
+ ) -> exp.Expression:
151
+ for func in funcs:
152
+ expr = func(expr)
153
+ assert isinstance(expr, exp.Expression), (
154
+ f"Func returned an instance of type [{type(expr)}], " "should have been Expression."
155
+ )
156
+ return expr
157
+
158
+
159
def get_hash_transform(
    source: Dialect,
    layer: str,
):
    """Return (as a one-element list) the hash function configured for
    *source* and *layer*; raise ``ValueError`` when either is unmapped."""
    mapping = Dialect_hash_algo_mapping.get(source)
    if not mapping:
        raise ValueError(f"Source {source} is not supported. Please add it to Dialect_hash_algo_mapping dictionary.")

    algo = getattr(mapping, layer, None)
    if not algo:
        raise ValueError(
            f"Layer {layer} is not supported for source {source}. Please add it to Dialect_hash_algo_mapping dictionary."
        )
    return [algo]
173
+
174
+
175
def build_from_clause(table_name: str, table_alias: str | None = None) -> exp.From:
    """Build ``FROM table_name [AS table_alias]``."""
    return exp.From(this=exp.Table(this=exp.Identifier(this=table_name), alias=table_alias))
177
+
178
+
179
def build_join_clause(
    table_name: str,
    join_columns: list,
    source_table_alias: str | None = None,
    target_table_alias: str | None = None,
    kind: str = "inner",
    func: Callable = exp.NullSafeEQ,
) -> exp.Join:
    """Build a JOIN on *join_columns*, combining per-column predicates with AND.

    *func* is the comparison expression class used for each column pair
    (null-safe equality by default).
    """
    predicates = [
        func(
            this=exp.Column(this=column, table=source_table_alias),
            expression=exp.Column(this=column, table=target_table_alias),
        )
        for column in join_columns
    ]

    # Fold the per-column predicates into a single ON condition.
    on_condition: exp.NullSafeEQ | exp.And = predicates[0]
    for predicate in predicates[1:]:
        on_condition = exp.And(this=on_condition, expression=predicate)

    return exp.Join(
        this=exp.Table(this=exp.Identifier(this=table_name), alias=target_table_alias),
        kind=kind,
        on=on_condition,
    )
203
+
204
+
205
def build_sub(
    left_column_name: str,
    right_column_name: str,
    left_table_name: str | None = None,
    right_table_name: str | None = None,
) -> exp.Sub:
    """Build ``left - right`` between two (optionally table-qualified) columns."""
    return exp.Sub(
        this=build_column(left_column_name, left_table_name),
        expression=build_column(right_column_name, right_table_name),
    )
215
+
216
+
217
def build_where_clause(where_clause: list[exp.Expression], condition_type: str = "or") -> exp.Expression:
    """Fold *where_clause* into one predicate.

    ``condition_type == "or"`` joins with OR; any other value joins with AND.
    The fold is seeded with a parenthesised ``1 = 1 <op> 1 = 1`` tautology so
    the result is a valid predicate even for an empty list.
    """
    connector = exp.Or if condition_type == "or" else exp.And
    combined: exp.Expression = exp.Paren(this=connector(this='1 = 1', expression='1 = 1'))

    for clause in where_clause:
        combined = connector(this=combined, expression=clause)

    return combined
227
+
228
+
229
def build_if(this: exp.Expression, true: exp.Expression, false: exp.Expression | None = None) -> exp.If:
    """Build ``IF(this, true[, false])``."""
    return exp.If(this=this, true=true, false=false)
231
+
232
+
233
def build_between(this: exp.Expression, low: exp.Expression, high: exp.Expression) -> exp.Between:
    """Build ``this BETWEEN low AND high``."""
    return exp.Between(this=this, low=low, high=high)
235
+
236
+
237
def _get_is_string(column_types_dict: dict[str, DataType], column_name: str) -> bool:
    """True unless the column's Spark type is numeric (unknown columns count as string)."""
    return not isinstance(column_types_dict.get(column_name), NumericType)
241
+
242
+
243
# Per-dialect column normalizations applied before hashing, keyed first by a
# dialect key ("universal" applies everywhere) and then by sqlglot DataType
# name (or "default" as a catch-all). NULLs are replaced with the
# '_null_recon_' sentinel and values are trimmed so both sides of a
# reconciliation hash identical bytes for identical data.
DataType_transform_mapping: dict[str, dict[str, list[partial[exp.Expression]]]] = {
    "universal": {"default": [partial(coalesce, default='_null_recon_', is_string=True), partial(trim)]},
    "snowflake": {exp.DataType.Type.ARRAY.value: [partial(array_to_string), partial(array_sort)]},
    "oracle": {
        exp.DataType.Type.NCHAR.value: [
            partial(anonymous, func="NVL(TRIM(TO_CHAR({})),'_null_recon_')", dialect=get_dialect("oracle"))
        ],
        exp.DataType.Type.NVARCHAR.value: [
            partial(anonymous, func="NVL(TRIM(TO_CHAR({})),'_null_recon_')", dialect=get_dialect("oracle"))
        ],
    },
    "databricks": {
        # Arrays are sorted then joined so element order cannot change the hash.
        exp.DataType.Type.ARRAY.value: [
            partial(anonymous, func="CONCAT_WS(',', SORT_ARRAY({}))", dialect=get_dialect("databricks"))
        ],
    },
    "tsql": {
        "default": [partial(anonymous, func="COALESCE(LTRIM(RTRIM(CAST([{}] AS VARCHAR(256)))), '_null_recon_')")],
        # CONVERT style codes 101/108/120 pin the textual date/time formats.
        exp.DataType.Type.DATE.value: [partial(anonymous, func="COALESCE(CONVERT(DATE, {0}, 101), '1900-01-01')")],
        exp.DataType.Type.TIME.value: [partial(anonymous, func="COALESCE(CONVERT(TIME, {0}, 108), '00:00:00')")],
        exp.DataType.Type.DATETIME.value: [
            partial(anonymous, func="COALESCE(CONVERT(DATETIME, {0}, 120), '1900-01-01 00:00:00')")
        ],
    },
}
268
+
269
# Reusable hash partials; both operate on an already-built expression (is_expr=True).
sha256_partial = partial(sha2, num_bits="256", is_expr=True)
md5_partial = partial(md5, is_expr=True)
# Hash algorithm used on each side of a reconciliation, keyed by source
# dialect. Source and target algorithms must produce matching digests for
# matching rows; Oracle pairs DBMS_CRYPTO.HASH(..., 2) with MD5 on the
# Databricks side, TSQL pairs HASHBYTES SHA2_256 with SHA2-256.
Dialect_hash_algo_mapping: dict[Dialect, HashAlgoMapping] = {
    get_dialect("snowflake"): HashAlgoMapping(
        source=sha256_partial,
        target=sha256_partial,
    ),
    get_dialect("oracle"): HashAlgoMapping(
        source=partial(
            anonymous, func="DBMS_CRYPTO.HASH(RAWTOHEX({}), 2)", is_expr=True, dialect=get_dialect("oracle")
        ),
        target=md5_partial,
    ),
    get_dialect("databricks"): HashAlgoMapping(
        source=sha256_partial,
        target=sha256_partial,
    ),
    get_dialect("tsql"): HashAlgoMapping(
        source=partial(
            anonymous, func="CONVERT(VARCHAR(256), HASHBYTES('SHA2_256', CONVERT(VARCHAR(256),{})), 2)", is_expr=True
        ),
        target=sha256_partial,
    ),
}
@@ -0,0 +1,91 @@
1
+ import logging
2
+
3
+ from functools import reduce
4
+ import sqlglot.expressions as exp
5
+ from sqlglot import Dialect
6
+
7
+ from databricks.labs.lakebridge.reconcile.query_builder.base import QueryBuilder
8
+ from databricks.labs.lakebridge.reconcile.query_builder.expression_generator import (
9
+ build_column,
10
+ concat,
11
+ get_hash_transform,
12
+ lower,
13
+ transform_expression,
14
+ )
15
+ from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import get_dialect
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
def _hash_transform(
    node: exp.Expression,
    source: Dialect,
    layer: str,
):
    """Apply the dialect/layer-specific hash algorithm to *node*."""
    return transform_expression(node, get_hash_transform(source, layer))
27
+
28
+
29
# Alias under which the computed row-hash is projected in the generated query.
_HASH_COLUMN_NAME = "hash_value_recon"
30
+
31
+
32
class HashQueryBuilder(QueryBuilder):
    """Builds the per-layer SELECT that projects a single row-level hash
    (aliased ``hash_value_recon``) over the comparable columns plus the key
    columns used to line rows up between source and target."""

    def build_query(self, report_type: str) -> str:
        """Return the hash query SQL for this layer.

        For non-"row" report types, join columns are mandatory; for "row"
        reports every hashed column doubles as a key column.
        """
        if report_type != 'row':
            self._validate(self.join_columns, f"Join Columns are compulsory for {report_type} type")

        _join_columns = self.join_columns if self.join_columns else set()
        # Columns fed to the hash: keys + selected, minus thresholds and drops.
        hash_cols = sorted((_join_columns | self.select_columns) - self.threshold_columns - self.drop_columns)

        key_cols = hash_cols if report_type == "row" else sorted(_join_columns | self.partition_column)

        cols_with_alias = [
            build_column(this=col, alias=self.table_conf.get_layer_tgt_to_src_col_mapping(col, self.layer))
            for col in key_cols
        ]

        # in case if we have column mapping, we need to sort the target columns in the order of source columns to get
        # same hash value
        hash_cols_with_alias = [
            {"this": col, "alias": self.table_conf.get_layer_tgt_to_src_col_mapping(col, self.layer)}
            for col in hash_cols
        ]
        sorted_hash_cols_with_alias = sorted(hash_cols_with_alias, key=lambda column: column["alias"])
        hashcols_sorted_as_src_seq = [column["this"] for column in sorted_hash_cols_with_alias]

        key_cols_with_transform = (
            self._apply_user_transformation(cols_with_alias) if self.user_transformations else cols_with_alias
        )
        hash_col_with_transform = [self._generate_hash_algorithm(hashcols_sorted_as_src_seq, _HASH_COLUMN_NAME)]

        # Target side is always rendered in the Databricks dialect.
        dialect = self.engine if self.layer == "source" else get_dialect("databricks")
        res = (
            exp.select(*hash_col_with_transform + key_cols_with_transform)
            .from_(":tbl")
            .where(self.filter)
            .sql(dialect=dialect)
        )

        logger.info(f"Hash Query for {self.layer}: {res}")
        return res

    def _generate_hash_algorithm(
        self,
        cols: list[str],
        column_alias: str,
    ) -> exp.Expression:
        """Concatenate *cols* (with per-type transformations applied), hash the
        result with the dialect's algorithm, lower-case it, and alias it as
        *column_alias*."""
        cols_with_alias = [build_column(this=col, alias=None) for col in cols]
        cols_with_transform = self.add_transformations(
            cols_with_alias, self.engine if self.layer == "source" else get_dialect("databricks")
        )
        col_exprs = exp.select(*cols_with_transform).iter_expressions()
        concat_expr = concat(list(col_exprs))

        # Oracle: fold into `a || b || c` instead of CONCAT(...).
        # NOTE(review): self.engine is used as a sqlglot dialect elsewhere in
        # this method; confirm comparing it with the string "oracle" matches
        # the engine's actual representation.
        if self.engine == "oracle":
            concat_expr = reduce(lambda x, y: exp.DPipe(this=x, expression=y), concat_expr.expressions)

        hash_expr = concat_expr.transform(_hash_transform, self.engine, self.layer).transform(lower, is_expr=True)

        return build_column(hash_expr, alias=column_alias)
@@ -0,0 +1,123 @@
1
+ import logging
2
+
3
+ import sqlglot.expressions as exp
4
+ from pyspark.sql import DataFrame
5
+ from sqlglot import select
6
+
7
+ from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import get_key_from_dialect
8
+ from databricks.labs.lakebridge.reconcile.query_builder.base import QueryBuilder
9
+ from databricks.labs.lakebridge.reconcile.query_builder.expression_generator import (
10
+ build_column,
11
+ build_literal,
12
+ _get_is_string,
13
+ build_join_clause,
14
+ )
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
def _union_concat(
    unions: list[exp.Select],
    result: exp.Union | exp.Select,
    cnt=0,
) -> exp.Select | exp.Union:
    """Recursively UNION the selects in *unions* onto *result*, consuming
    entries after index *cnt*; a single-element list is returned as-is."""
    if len(unions) == 1:
        return result
    if cnt == len(unions) - 2:
        # Last pair: attach the final select and stop recursing.
        return exp.union(result, unions[cnt + 1])
    nxt = cnt + 1
    return _union_concat(unions, exp.union(result, unions[nxt]), nxt)
31
+
32
+
33
class SamplingQueryBuilder(QueryBuilder):
    """Builds the per-layer sampling queries used to fetch the concrete rows
    behind a set of sampled key values so they can be compared side by side."""

    def build_query_with_alias(self):
        """Return a plain SELECT of all comparable columns (aliased back to
        source naming) from the layer's table, without key filtering."""
        self._validate(self.join_columns, "Join Columns are compulsory for sampling query")
        join_columns = self.join_columns if self.join_columns else set()

        cols = sorted((join_columns | self.select_columns) - self.threshold_columns - self.drop_columns)

        cols_with_alias = [
            build_column(this=col, alias=self.table_conf.get_layer_tgt_to_src_col_mapping(col, self.layer))
            for col in cols
        ]

        query = select(*cols_with_alias).from_(":tbl").where(self.filter).sql(dialect=self.engine)

        logger.info(f"Sampling Query with Alias for {self.layer}: {query}")
        return query

    def build_query(self, df: DataFrame):
        """Return SQL that inlines the sampled keys in *df* as a ``recon`` CTE
        and joins the layer's table against it, so only rows whose keys appear
        in the sample are selected."""
        self._validate(self.join_columns, "Join Columns are compulsory for sampling query")
        join_columns = self.join_columns if self.join_columns else set()
        if self.layer == "source":
            key_cols = sorted(join_columns)
        else:
            key_cols = sorted(self.table_conf.get_tgt_to_src_col_mapping_list(join_columns))
        keys_df = df.select(*key_cols)
        with_clause = self._get_with_clause(keys_df)

        cols = sorted((join_columns | self.select_columns) - self.threshold_columns - self.drop_columns)

        cols_with_alias = [
            build_column(this=col, alias=self.table_conf.get_layer_tgt_to_src_col_mapping(col, self.layer))
            for col in cols
        ]

        sql_with_transforms = self.add_transformations(cols_with_alias, self.engine)
        query_sql = select(*sql_with_transforms).from_(":tbl").where(self.filter)
        if self.layer == "source":
            with_select = [build_column(this=col, table_name="src") for col in sorted(cols)]
        else:
            with_select = [
                build_column(this=col, table_name="src")
                for col in sorted(self.table_conf.get_tgt_to_src_col_mapping_list(cols))
            ]

        join_clause = SamplingQueryBuilder._get_join_clause(key_cols)

        query = (
            with_clause.with_(alias="src", as_=query_sql)
            .select(*with_select)
            .from_("src")
            .join(join_clause)
            .sql(dialect=self.engine)
        )
        logger.info(f"Sampling Query for {self.layer}: {query}")
        return query

    @classmethod
    def _get_join_clause(cls, key_cols: list):
        """Equi-join between the layer table (``src``) and the sampled keys (``recon``)."""
        return build_join_clause(
            "recon", key_cols, source_table_alias="src", target_table_alias="recon", kind="inner", func=exp.EQ
        )

    def _get_with_clause(self, df: DataFrame) -> exp.Select:
        """Inline the key rows of *df* as a ``recon`` CTE built from UNIONed
        literal SELECTs (``FROM dual`` on Oracle).

        Fix: the type-lookup tables derived from the DataFrame schema and the
        table schema are loop-invariant, so they are now built once instead of
        being rebuilt for every collected row.
        """
        # Loop-invariant lookups: Spark type per (lowercased) column, and the
        # original source data type for columns without user transformations.
        column_types_dict = {str(f.name).lower(): f.dataType for f in df.schema.fields}
        orig_types_dict = {
            schema.column_name: schema.data_type
            for schema in self.schema
            if schema.column_name not in self.user_transformations
        }
        needs_dual = get_key_from_dialect(self.engine) == "oracle"

        union_res = []
        for row in df.collect():
            row_select = [
                (
                    build_literal(
                        this=str(value),
                        alias=col,
                        is_string=_get_is_string(column_types_dict, col),
                        cast=orig_types_dict.get(col),
                    )
                    if value is not None
                    else exp.Alias(this=exp.Null(), alias=col)
                )
                for col, value in zip(df.columns, row)
            ]
            # Oracle requires a FROM clause even for pure literal selects.
            if needs_dual:
                union_res.append(select(*row_select).from_("dual"))
            else:
                union_res.append(select(*row_select))
        # NOTE(review): assumes df has at least one row — union_res[0] raises
        # IndexError otherwise, same as the original implementation.
        union_statements = _union_concat(union_res, union_res[0], 0)
        return exp.Select().with_(alias='recon', as_=union_statements)