databricks-labs-lakebridge 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (171)
  1. databricks/__init__.py +3 -0
  2. databricks/labs/__init__.py +3 -0
  3. databricks/labs/lakebridge/__about__.py +2 -0
  4. databricks/labs/lakebridge/__init__.py +11 -0
  5. databricks/labs/lakebridge/assessments/configure_assessment.py +194 -0
  6. databricks/labs/lakebridge/assessments/pipeline.py +188 -0
  7. databricks/labs/lakebridge/assessments/profiler_config.py +30 -0
  8. databricks/labs/lakebridge/base_install.py +12 -0
  9. databricks/labs/lakebridge/cli.py +449 -0
  10. databricks/labs/lakebridge/config.py +192 -0
  11. databricks/labs/lakebridge/connections/__init__.py +0 -0
  12. databricks/labs/lakebridge/connections/credential_manager.py +89 -0
  13. databricks/labs/lakebridge/connections/database_manager.py +98 -0
  14. databricks/labs/lakebridge/connections/env_getter.py +13 -0
  15. databricks/labs/lakebridge/contexts/__init__.py +0 -0
  16. databricks/labs/lakebridge/contexts/application.py +133 -0
  17. databricks/labs/lakebridge/coverage/__init__.py +0 -0
  18. databricks/labs/lakebridge/coverage/commons.py +223 -0
  19. databricks/labs/lakebridge/coverage/lakebridge_snow_transpilation_coverage.py +29 -0
  20. databricks/labs/lakebridge/coverage/local_report.py +9 -0
  21. databricks/labs/lakebridge/coverage/sqlglot_snow_transpilation_coverage.py +5 -0
  22. databricks/labs/lakebridge/coverage/sqlglot_tsql_transpilation_coverage.py +5 -0
  23. databricks/labs/lakebridge/deployment/__init__.py +0 -0
  24. databricks/labs/lakebridge/deployment/configurator.py +199 -0
  25. databricks/labs/lakebridge/deployment/dashboard.py +140 -0
  26. databricks/labs/lakebridge/deployment/installation.py +125 -0
  27. databricks/labs/lakebridge/deployment/job.py +147 -0
  28. databricks/labs/lakebridge/deployment/recon.py +145 -0
  29. databricks/labs/lakebridge/deployment/table.py +30 -0
  30. databricks/labs/lakebridge/deployment/upgrade_common.py +124 -0
  31. databricks/labs/lakebridge/discovery/table.py +36 -0
  32. databricks/labs/lakebridge/discovery/table_definition.py +23 -0
  33. databricks/labs/lakebridge/discovery/tsql_table_definition.py +185 -0
  34. databricks/labs/lakebridge/errors/exceptions.py +1 -0
  35. databricks/labs/lakebridge/helpers/__init__.py +0 -0
  36. databricks/labs/lakebridge/helpers/db_sql.py +24 -0
  37. databricks/labs/lakebridge/helpers/execution_time.py +20 -0
  38. databricks/labs/lakebridge/helpers/file_utils.py +64 -0
  39. databricks/labs/lakebridge/helpers/metastore.py +164 -0
  40. databricks/labs/lakebridge/helpers/recon_config_utils.py +176 -0
  41. databricks/labs/lakebridge/helpers/string_utils.py +62 -0
  42. databricks/labs/lakebridge/helpers/telemetry_utils.py +13 -0
  43. databricks/labs/lakebridge/helpers/validation.py +101 -0
  44. databricks/labs/lakebridge/install.py +849 -0
  45. databricks/labs/lakebridge/intermediate/__init__.py +0 -0
  46. databricks/labs/lakebridge/intermediate/dag.py +88 -0
  47. databricks/labs/lakebridge/intermediate/engine_adapter.py +0 -0
  48. databricks/labs/lakebridge/intermediate/root_tables.py +44 -0
  49. databricks/labs/lakebridge/jvmproxy.py +56 -0
  50. databricks/labs/lakebridge/lineage.py +42 -0
  51. databricks/labs/lakebridge/reconcile/__init__.py +0 -0
  52. databricks/labs/lakebridge/reconcile/compare.py +414 -0
  53. databricks/labs/lakebridge/reconcile/connectors/__init__.py +0 -0
  54. databricks/labs/lakebridge/reconcile/connectors/data_source.py +72 -0
  55. databricks/labs/lakebridge/reconcile/connectors/databricks.py +87 -0
  56. databricks/labs/lakebridge/reconcile/connectors/jdbc_reader.py +41 -0
  57. databricks/labs/lakebridge/reconcile/connectors/oracle.py +108 -0
  58. databricks/labs/lakebridge/reconcile/connectors/secrets.py +30 -0
  59. databricks/labs/lakebridge/reconcile/connectors/snowflake.py +173 -0
  60. databricks/labs/lakebridge/reconcile/connectors/source_adapter.py +30 -0
  61. databricks/labs/lakebridge/reconcile/connectors/sql_server.py +132 -0
  62. databricks/labs/lakebridge/reconcile/constants.py +37 -0
  63. databricks/labs/lakebridge/reconcile/exception.py +42 -0
  64. databricks/labs/lakebridge/reconcile/execute.py +920 -0
  65. databricks/labs/lakebridge/reconcile/query_builder/__init__.py +0 -0
  66. databricks/labs/lakebridge/reconcile/query_builder/aggregate_query.py +293 -0
  67. databricks/labs/lakebridge/reconcile/query_builder/base.py +138 -0
  68. databricks/labs/lakebridge/reconcile/query_builder/count_query.py +33 -0
  69. databricks/labs/lakebridge/reconcile/query_builder/expression_generator.py +292 -0
  70. databricks/labs/lakebridge/reconcile/query_builder/hash_query.py +91 -0
  71. databricks/labs/lakebridge/reconcile/query_builder/sampling_query.py +123 -0
  72. databricks/labs/lakebridge/reconcile/query_builder/threshold_query.py +231 -0
  73. databricks/labs/lakebridge/reconcile/recon_capture.py +635 -0
  74. databricks/labs/lakebridge/reconcile/recon_config.py +363 -0
  75. databricks/labs/lakebridge/reconcile/recon_output_config.py +85 -0
  76. databricks/labs/lakebridge/reconcile/runner.py +97 -0
  77. databricks/labs/lakebridge/reconcile/sampler.py +239 -0
  78. databricks/labs/lakebridge/reconcile/schema_compare.py +126 -0
  79. databricks/labs/lakebridge/resources/__init__.py +0 -0
  80. databricks/labs/lakebridge/resources/config/credentials.yml +33 -0
  81. databricks/labs/lakebridge/resources/reconcile/__init__.py +0 -0
  82. databricks/labs/lakebridge/resources/reconcile/dashboards/__init__.py +0 -0
  83. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/00_0_aggregate_recon_header.md +6 -0
  84. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_0_recon_id.filter.yml +6 -0
  85. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_1_executed_by.filter.yml +5 -0
  86. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_2_started_at.filter.yml +5 -0
  87. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_0_source_type.filter.yml +5 -0
  88. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_1_source_table.filter.yml +5 -0
  89. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_2_target_table.filter.yml +5 -0
  90. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/04_0_aggregate_summary_table.sql +46 -0
  91. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/05_0_aggregate_recon_drilldown_header.md +2 -0
  92. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_0_recon_id.filter.yml +5 -0
  93. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_1_category.filter.yml +5 -0
  94. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_2_aggregate_type.filter.yml +5 -0
  95. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/07_0_target_table.filter.yml +4 -0
  96. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/07_1_source_table.filter.yml +4 -0
  97. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/08_0_aggregate_details_table.sql +92 -0
  98. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/09_0_aggregate_missing_mismatch_header.md +1 -0
  99. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/10_0_aggr_mismatched_records.sql +19 -0
  100. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/11_0_aggr_missing_in_databricks.sql +19 -0
  101. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/11_1_aggr_missing_in_source.sql +19 -0
  102. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/dashboard.yml +365 -0
  103. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/00_0_recon_main.md +3 -0
  104. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/01_0_recon_id.filter.yml +6 -0
  105. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/01_1_report_type.filter.yml +5 -0
  106. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/01_2_executed_by.filter.yml +5 -0
  107. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/02_0_source_type.filter.yml +5 -0
  108. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/02_1_source_table.filter.yml +6 -0
  109. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/02_2_target_table.filter.yml +6 -0
  110. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/03_0_started_at.filter.yml +5 -0
  111. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/05_0_summary_table.sql +38 -0
  112. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/06_0_schema_comparison_header.md +3 -0
  113. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/07_0_schema_details_table.sql +42 -0
  114. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/08_0_drill_down_header.md +3 -0
  115. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/09_0_recon_id.filter.yml +4 -0
  116. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/09_1_category.filter.yml +4 -0
  117. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/10_0_target_table.filter.yml +4 -0
  118. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/10_1_source_table.filter.yml +4 -0
  119. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/11_0_recon_details_pivot.sql +40 -0
  120. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/12_0_daily_data_validation_issue_header.md +3 -0
  121. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/13_0_success_fail_.filter.yml +4 -0
  122. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/14_0_failed_recon_ids.sql +15 -0
  123. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/15_0_total_failed_runs.sql +10 -0
  124. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/15_1_failed_targets.sql +10 -0
  125. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/15_2_successful_targets.sql +10 -0
  126. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/16_0_missing_mismatch_header.md +1 -0
  127. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/17_0_mismatched_records.sql +14 -0
  128. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/17_1_threshold_mismatches.sql +14 -0
  129. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/18_0_missing_in_databricks.sql +14 -0
  130. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/18_1_missing_in_source.sql +14 -0
  131. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/dashboard.yml +545 -0
  132. databricks/labs/lakebridge/resources/reconcile/queries/__init__.py +0 -0
  133. databricks/labs/lakebridge/resources/reconcile/queries/installation/__init__.py +0 -0
  134. databricks/labs/lakebridge/resources/reconcile/queries/installation/aggregate_details.sql +7 -0
  135. databricks/labs/lakebridge/resources/reconcile/queries/installation/aggregate_metrics.sql +15 -0
  136. databricks/labs/lakebridge/resources/reconcile/queries/installation/aggregate_rules.sql +6 -0
  137. databricks/labs/lakebridge/resources/reconcile/queries/installation/details.sql +7 -0
  138. databricks/labs/lakebridge/resources/reconcile/queries/installation/main.sql +24 -0
  139. databricks/labs/lakebridge/resources/reconcile/queries/installation/metrics.sql +21 -0
  140. databricks/labs/lakebridge/transpiler/__init__.py +0 -0
  141. databricks/labs/lakebridge/transpiler/execute.py +423 -0
  142. databricks/labs/lakebridge/transpiler/lsp/__init__.py +0 -0
  143. databricks/labs/lakebridge/transpiler/lsp/lsp_engine.py +564 -0
  144. databricks/labs/lakebridge/transpiler/sqlglot/__init__.py +0 -0
  145. databricks/labs/lakebridge/transpiler/sqlglot/dialect_utils.py +30 -0
  146. databricks/labs/lakebridge/transpiler/sqlglot/generator/__init__.py +0 -0
  147. databricks/labs/lakebridge/transpiler/sqlglot/generator/databricks.py +771 -0
  148. databricks/labs/lakebridge/transpiler/sqlglot/lca_utils.py +138 -0
  149. databricks/labs/lakebridge/transpiler/sqlglot/local_expression.py +197 -0
  150. databricks/labs/lakebridge/transpiler/sqlglot/parsers/__init__.py +0 -0
  151. databricks/labs/lakebridge/transpiler/sqlglot/parsers/oracle.py +23 -0
  152. databricks/labs/lakebridge/transpiler/sqlglot/parsers/presto.py +202 -0
  153. databricks/labs/lakebridge/transpiler/sqlglot/parsers/snowflake.py +535 -0
  154. databricks/labs/lakebridge/transpiler/sqlglot/sqlglot_engine.py +203 -0
  155. databricks/labs/lakebridge/transpiler/transpile_engine.py +49 -0
  156. databricks/labs/lakebridge/transpiler/transpile_status.py +68 -0
  157. databricks/labs/lakebridge/uninstall.py +28 -0
  158. databricks/labs/lakebridge/upgrades/v0.4.0_add_main_table_operation_name_column.py +80 -0
  159. databricks/labs/lakebridge/upgrades/v0.6.0_alter_metrics_datatype.py +51 -0
  160. databricks_labs_lakebridge-0.10.0.dist-info/METADATA +58 -0
  161. databricks_labs_lakebridge-0.10.0.dist-info/RECORD +171 -0
  162. databricks_labs_lakebridge-0.10.0.dist-info/WHEEL +4 -0
  163. databricks_labs_lakebridge-0.10.0.dist-info/entry_points.txt +2 -0
  164. databricks_labs_lakebridge-0.10.0.dist-info/licenses/LICENSE +69 -0
  165. databricks_labs_lakebridge-0.10.0.dist-info/licenses/NOTICE +42 -0
  166. docs/lakebridge/src/components/Button.tsx +81 -0
  167. docs/lakebridge/src/css/custom.css +167 -0
  168. docs/lakebridge/src/css/table.css +20 -0
  169. docs/lakebridge/src/pages/index.tsx +57 -0
  170. docs/lakebridge/src/theme/Footer/index.tsx +24 -0
  171. docs/lakebridge/src/theme/Layout/index.tsx +18 -0
@@ -0,0 +1,293 @@
1
+ import logging
2
+ from itertools import groupby
3
+ from operator import attrgetter
4
+
5
+ import sqlglot.expressions as exp
6
+
7
+ from databricks.labs.lakebridge.reconcile.query_builder.base import QueryBuilder
8
+ from databricks.labs.lakebridge.reconcile.query_builder.expression_generator import (
9
+ build_column,
10
+ )
11
+ from databricks.labs.lakebridge.reconcile.recon_config import (
12
+ Aggregate,
13
+ AggregateQueryRules,
14
+ AggregateRule,
15
+ )
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
def _remove_aliases(node: exp.Expression) -> exp.Expression:
    """Return the expression wrapped by a top-level alias, or *node* itself when it is not aliased."""
    return node.this if isinstance(node, exp.Alias) else node
24
+
25
+
26
class AggregateQueryBuilder(QueryBuilder):
    """Builds per-layer aggregate reconciliation queries, together with the
    :class:`AggregateRule` objects describing each aggregate, from the
    ``Aggregate`` entries configured on the table."""

    def _get_mapping_col(self, col: str) -> str:
        """
        Get the column mapping for the given column based on the layer.

        Examples:
            Input :: col: "COL1", mapping: "{source: COL1, target: COLUMN1}", layer: "source"

            Returns -> "COLUMN1"

        :param col: Column Name
        :return: Mapped Column Name if found, else Column Name
        """
        # Only one mapping direction is relevant per layer, so compute just that one
        # (previously both were computed and one discarded).
        if self.layer == "target":
            return self.table_conf.get_layer_src_to_tgt_col_mapping(col, self.layer)
        return self.table_conf.get_layer_tgt_to_src_col_mapping(col, self.layer)

    def _get_mapping_cols_with_alias(self, cols_list: list[str], agg_type: str) -> list[exp.Expression]:
        """
        Creates a Column Expression for each [mapped] column with ``agg_type<#>original_column`` as alias.

        Examples:
            Input :: cols_list: ["COL1", "COL2"], agg_type: "MAX"

            Returns -> ["column1 AS max<#>col1", "column2 AS max<#>col2"]

        :param cols_list: List of aggregate columns
        :param agg_type: MIN, MAX, COUNT, AVG (or GROUP_BY for group-by columns)
        :return: list[Expression] - List of Column Expressions with Alias
        """
        # "<#>" is an internal separator later split by _agg_query_cols_with_alias.
        return [
            build_column(this=self._get_mapping_col(col), alias=f"{agg_type.lower()}<#>{col.lower()}")
            for col in cols_list
        ]

    def _agg_query_cols_with_alias(self, transformed_cols: list[exp.Expression]) -> list[exp.Expression]:
        """
        Wraps each aliased column in its aggregate function and re-aliases it as
        ``<layer>_<agg_type>_<original_column>`` (e.g. ``source_min_pid``).

        Columns tagged ``group_by`` are passed through without a surrounding function call.
        """
        cols_with_alias = []

        for transformed_col in transformed_cols:
            # The alias was built as "<agg_type><#><original column>", e.g. "min<#>pid".
            agg_type, org_col_name = transformed_col.alias.split("<#>")

            # New alias carrying layer, agg_type and original column name,
            # ex: source_min_pid, target_max_product_id
            layer_agg_type_col_alias = f"{self.layer}_{agg_type}_{org_col_name}".lower()

            # The transformed column SQL without its alias suffix.
            col_name = transformed_col.sql().replace(f"AS {transformed_col.alias}", '').strip()

            # ex: MIN(pid) AS source_min_pid; group_by columns are not wrapped in a function.
            column_name = f"{col_name}" if agg_type == "group_by" else f"{agg_type}({col_name})"
            cols_with_alias.append(build_column(this=column_name, alias=layer_agg_type_col_alias))

        return cols_with_alias

    def _get_layer_query(self, group_list: list[Aggregate]) -> AggregateQueryRules:
        """
        Builds the query based on the layer:
          * Creates an Expression using
            - 'select' columns with alias for the aggregate columns
            - 'filters' (where) based on the layer
            - 'group by' if group_by_columns are defined
          * Generates and returns the SQL query using the above Expression and Dialect,
            along with the query's Aggregate rules

        Examples:
            1. group_list: [Aggregate(type="Max", agg_cols=["col2", "col3"], group_by_columns=["col1"]),
                            Aggregate(type="Sum", agg_cols=["col1", "col2"], group_by_columns=["col1"])]
               Returns -> SELECT max(col2) AS src_max_col2, max(col3) AS src_max_col3,
                                 sum(col1) AS src_sum_col1, sum(col2) AS src_sum_col2
                          FROM :tbl WHERE col1 IS NOT NULL GROUP BY col1
            2. group_list: [Aggregate(type="avg", agg_cols=["col4"])], layer: "tgt"
               Returns -> SELECT avg(col4) AS tgt_avg_col4 FROM :tbl

        :param group_list: Non-empty list of Aggregate objects sharing the same group-by columns
        :return: AggregateQueryRules with the rendered SQL and its rules
        """
        # `assert` is stripped under `python -O`; validate explicitly instead.
        if not group_list:
            raise ValueError("At least one item must be present in the group_list.")

        # Generates a single query for multiple aggregates with the same
        # group_by_columns, refer to Example 1.
        cols_with_mapping: list[exp.Expression] = []
        query_agg_rules: list[AggregateRule] = []
        processed_rules: dict[str, str] = {}
        for agg in group_list:

            # Skip duplicate rules.
            # Example: {min_grp1+__+grp2 : col1+__+col2}, key = min_grp1+__+grp2
            key = f"{agg.type}_{agg.group_by_columns_as_str}"
            if key in processed_rules:
                existing_rule = processed_rules.get(key)
                if existing_rule == agg.agg_columns_as_str:
                    logger.info(
                        f"Skipping duplicate rule for key: {key}, value: {agg.agg_columns_as_str},"
                        f" layer: {self.layer}"
                    )
                    continue
            processed_rules[key] = agg.agg_columns_as_str

            # Get the rules for each aggregate and append to the query_agg_rules list.
            query_agg_rules.extend(self._build_aggregate_rules(agg))

            # Get the mapping with alias for aggregate columns and append to cols_with_mapping.
            cols_with_mapping.extend(self._get_mapping_cols_with_alias(agg.agg_columns, agg.type))

        # Apply user transformations on select columns, e.g.
        # {column_name: creation_date, source: creation_date, target: to_date(creation_date,'yyyy-mm-dd')}
        select_cols_with_transform = (
            self._apply_user_transformation(cols_with_mapping) if self.user_transformations else cols_with_mapping
        )

        # Transformed columns
        select_cols_with_alias = self._agg_query_cols_with_alias(select_cols_with_transform)
        query_exp = exp.select(*select_cols_with_alias).from_(":tbl").where(self.filter)

        # Apply GROUP BY if group_by_columns are defined.
        if group_list[0].group_by_columns:
            group_by_cols_with_mapping = self._get_mapping_cols_with_alias(group_list[0].group_by_columns, "GROUP_BY")

            # Apply user transformations on group_by_columns as well.
            group_by_cols_with_transform = (
                self._apply_user_transformation(group_by_cols_with_mapping)
                if self.user_transformations
                else group_by_cols_with_mapping
            )

            select_group_by_cols_with_alias = self._agg_query_cols_with_alias(group_by_cols_with_transform)

            # GROUP BY doesn't support aliases (GROUP BY to_date(COL1, 'yyyy-MM-dd') AS col1
            # throws an error), so strip them.
            group_by_col_without_alias = [
                build_column(this=_remove_aliases(group_by_col_with_alias).sql())
                for group_by_col_with_alias in select_group_by_cols_with_alias
                if " AS " in group_by_col_with_alias.sql()
            ]

            query_exp = (
                exp.select(*select_cols_with_alias + select_group_by_cols_with_alias)
                .from_(":tbl")
                .where(self.filter)
                .group_by(*group_by_col_without_alias)
            )

        return AggregateQueryRules(
            layer=self.layer,
            group_by_columns=group_list[0].group_by_columns,
            group_by_columns_as_str=group_list[0].group_by_columns_as_str,
            query=query_exp.sql(dialect=self.engine),
            rules=query_agg_rules,
        )

    def grouped_aggregates(self):
        """
        Group the configured aggregates by their ``group_by_columns_as_str`` key.

        Example:
            aggregates = [
                Aggregate(type="Min", agg_cols=["c_nation_str", "col2"], group_by_columns=["col3"]),
                Aggregate(type="Max", agg_cols=["col2", "col3"], group_by_columns=["col1"]),
                Aggregate(type="avg", agg_cols=["col4"]),
                Aggregate(type="sum", agg_cols=["col3", "col6"], group_by_columns=["col1"]),
            ]
            output groups:
              * key "NA"   -> [avg(col4)]
              * key "col1" -> [Max(col2, col3), sum(col3, col6)]
              * key "col3" -> [Min(c_nation_str, col2)]

        :return: itertools.groupby iterator of (key, group) pairs
        :raises InvalidInputException: if no aggregates are configured (None or empty)
        """
        # NOTE: an `assert` was previously used here; it is stripped under `python -O`,
        # so validate explicitly. An empty list is treated the same as missing config.
        self._validate(self.aggregates or None, "Aggregates config must be defined to build the queries.")
        key_fn = attrgetter("group_by_columns_as_str")
        # groupby requires input sorted by the same key; sorted() also avoids mutating
        # the shared table-conf aggregates list in place.
        return groupby(sorted(self.aggregates, key=key_fn), key=key_fn)

    @classmethod
    def _build_aggregate_rules(cls, agg: Aggregate) -> list[AggregateRule]:
        """
        Builds one AggregateRule per aggregate column of the given Aggregate object.

        Example:
            Input :: Aggregate(type="MIN", agg_cols=["COL1", "COL2"],
                               group_by_columns=["GRP1", "GRP2"])
            Returns -> [AggregateRule(agg_type="MIN", agg_column="COL1", ...),
                        AggregateRule(agg_type="MIN", agg_column="COL2", ...)]

        :param agg: Aggregate
        :return: list[AggregateRule]
        """
        return [
            AggregateRule(
                agg_type=agg.type,
                agg_column=agg_col,
                group_by_columns=agg.group_by_columns,
                group_by_columns_as_str=agg.group_by_columns_as_str,
            )
            for agg_col in agg.agg_columns
        ]

    def build_queries(self) -> list[AggregateQueryRules]:
        """
        Generates the queries-with-rules for this layer's list of Aggregate objects:
          * items are grouped by group_by_columns key, and one query is generated per group
          * called once per layer, so a source/target pair is produced overall

        Examples:
            [Aggregate(type="Max", agg_cols=["col3"], group_by_columns=["col1"]),
             Aggregate(type="Sum", agg_cols=["col2"], group_by_columns=["col4"])]
            source layer -> ["SELECT max(col3) AS src_max_col3 FROM :tbl GROUP BY col1",
                            "SELECT sum(col2) AS src_sum_col2 FROM :tbl GROUP BY col4"]

        :return: list of AggregateQueryRules, one per group_by_columns key
        """
        query_with_rules_list = []
        for key, group in self.grouped_aggregates():
            logger.info(f"Building Query and Rules for key: {key}, layer: {self.layer}")
            query_with_rules_list.append(self._get_layer_query(list(group)))

        return query_with_rules_list
@@ -0,0 +1,138 @@
1
+ import logging
2
+ from abc import ABC
3
+
4
+ import sqlglot.expressions as exp
5
+ from sqlglot import Dialect, parse_one
6
+
7
+ from databricks.labs.lakebridge.reconcile.exception import InvalidInputException
8
+ from databricks.labs.lakebridge.reconcile.query_builder.expression_generator import (
9
+ DataType_transform_mapping,
10
+ transform_expression,
11
+ )
12
+ from databricks.labs.lakebridge.reconcile.recon_config import Schema, Table, Aggregate
13
+ from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import get_dialect, SQLGLOT_DIALECTS
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
class QueryBuilder(ABC):
    """Base class for reconcile query builders.

    Exposes table-configuration-driven properties for one layer ("source" or
    "target") and shared helpers that apply user-defined and datatype-default
    column transformations to sqlglot expressions.
    """

    def __init__(
        self,
        table_conf: Table,
        schema: list[Schema],
        layer: str,
        engine: Dialect,
    ):
        """
        :param table_conf: Reconciliation configuration for the table
        :param schema: Column name/data type pairs for this layer
        :param layer: "source" or "target"
        :param engine: sqlglot dialect used to parse/render SQL for this layer
        """
        self._table_conf = table_conf
        self._schema = schema
        self._layer = layer
        self._engine = engine

    @property
    def engine(self) -> Dialect:
        """sqlglot dialect for this layer."""
        return self._engine

    @property
    def layer(self) -> str:
        """Which side of the reconciliation this builder serves: "source" or "target"."""
        return self._layer

    @property
    def schema(self) -> list[Schema]:
        """Schema (column name/data type pairs) for this layer."""
        return self._schema

    @property
    def table_conf(self) -> Table:
        """Reconciliation configuration for the table."""
        return self._table_conf

    @property
    def select_columns(self) -> set[str]:
        """Columns to select for this layer, derived from schema and table conf."""
        return self.table_conf.get_select_columns(self._schema, self._layer)

    @property
    def threshold_columns(self) -> set[str]:
        """Columns configured with reconciliation thresholds for this layer."""
        return self.table_conf.get_threshold_columns(self._layer)

    @property
    def join_columns(self) -> set[str] | None:
        """Join (key) columns for this layer, or None when not configured."""
        return self.table_conf.get_join_columns(self._layer)

    @property
    def drop_columns(self) -> set[str]:
        """Columns excluded from reconciliation for this layer."""
        return self._table_conf.get_drop_columns(self._layer)

    @property
    def partition_column(self) -> set[str]:
        """Partition column(s) for this layer."""
        return self._table_conf.get_partition_column(self._layer)

    @property
    def filter(self) -> str | None:
        """Optional WHERE-clause filter for this layer."""
        return self._table_conf.get_filter(self._layer)

    @property
    def user_transformations(self) -> dict[str, str]:
        """User-defined column transformations for this layer, keyed by column name."""
        return self._table_conf.get_transformation_dict(self._layer)

    @property
    def aggregates(self) -> list[Aggregate] | None:
        """Configured aggregates, or None when the table conf has none."""
        return self.table_conf.aggregates

    def add_transformations(self, aliases: list[exp.Expression], source: Dialect) -> list[exp.Expression]:
        """Apply user transformations where configured, then datatype-default
        transformations to the remaining columns.

        :param aliases: Aliased column expressions to transform
        :param source: Source dialect used to pick default datatype transforms
        :return: Transformed expressions
        """
        # Hoist the property: get_transformation_dict would otherwise be re-evaluated.
        user_transformations = self.user_transformations
        if user_transformations:
            alias_with_user_transforms = self._apply_user_transformation(aliases)
            # Default transforms apply only to columns without a user transform.
            default_transform_schema = [
                sch for sch in self.schema if sch.column_name not in user_transformations
            ]
            return self._apply_default_transformation(alias_with_user_transforms, default_transform_schema, source)
        return self._apply_default_transformation(aliases, self.schema, source)

    def _apply_user_transformation(self, aliases: list[exp.Expression]) -> list[exp.Expression]:
        """Replace columns having a user-defined transformation with its parsed expression."""
        transformations = self.user_transformations
        return [alias.transform(self._user_transformer, transformations) for alias in aliases]

    def _user_transformer(self, node: exp.Expression, user_transformations: dict[str, str]) -> exp.Expression:
        """sqlglot transform callback: swap a Column node for its user-defined expression, if any."""
        if isinstance(node, exp.Column) and user_transformations:
            # Target-side transformations are written in Databricks SQL.
            dialect = self.engine if self.layer == "source" else get_dialect("databricks")
            transformation = user_transformations.get(node.name)
            if transformation is not None:
                return parse_one(transformation, read=dialect)
        return node

    def _apply_default_transformation(
        self, aliases: list[exp.Expression], schema: list[Schema], source: Dialect
    ) -> list[exp.Expression]:
        """Apply datatype-based default transformations to every column in *aliases*."""
        return [alias.transform(self._default_transformer, schema, source) for alias in aliases]

    @staticmethod
    def _default_transformer(node: exp.Expression, schema: list[Schema], source: Dialect) -> exp.Expression:
        """sqlglot transform callback: wrap a Column node in the default transform
        registered for its data type and source dialect (falling back to the
        dialect default, then the universal default)."""

        def _get_transform(datatype: str):
            # Reverse-lookup the dialect key for the given Dialect instance.
            source_dialect = next(
                (source_key for source_key, dialect in SQLGLOT_DIALECTS.items() if dialect == source),
                "universal",
            )
            source_mapping = DataType_transform_mapping.get(source_dialect, {})

            # Single lookup per key (previously each key was looked up twice).
            transform = source_mapping.get(datatype.upper())
            if transform is not None:
                return transform
            dialect_default = source_mapping.get("default")
            if dialect_default is not None:
                return dialect_default

            return DataType_transform_mapping.get("universal", {}).get("default")

        schema_dict = {v.column_name: v.data_type for v in schema}
        if isinstance(node, exp.Column):
            column_name = node.name
            if column_name in schema_dict:
                transform = _get_transform(schema_dict.get(column_name, column_name))
                return transform_expression(node, transform)
        return node

    def _validate(self, field: set[str] | list[str] | None, message: str):
        """Raise InvalidInputException (after logging) when *field* is None.

        :param field: Value that must be present in the table configuration
        :param message: Human-readable description of the missing configuration
        :raises InvalidInputException: when *field* is None
        """
        if field is None:
            message = f"Exception for {self.table_conf.target_name} target table in {self.layer} layer --> {message}"
            logger.error(message)
            raise InvalidInputException(message)
@@ -0,0 +1,33 @@
1
+ import logging
2
+
3
+ from sqlglot import Dialect
4
+ from sqlglot import expressions as exp
5
+
6
+ from databricks.labs.lakebridge.reconcile.query_builder.expression_generator import build_column, build_literal
7
+ from databricks.labs.lakebridge.reconcile.recon_config import Table
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
class CountQueryBuilder:
    """Builds a ``SELECT COUNT(1)`` record-count query for one layer of a
    reconciliation, rendered in that layer's SQL dialect."""

    def __init__(
        self,
        table_conf: Table,
        layer: str,
        engine: Dialect,
    ):
        # Table reconciliation config, layer name ("source"/"target"), and sqlglot dialect.
        self._table_conf = table_conf
        self._layer = layer
        self._engine = engine

    def build_query(self):
        """Render and return the count query SQL, applying the layer's filter as WHERE."""
        count_column = build_column(this=exp.Count(this=build_literal(this="1", is_string=False)), alias="count")
        layer_filter = self._table_conf.get_filter(self._layer)
        query_expression = exp.select(count_column).from_(":tbl").where(layer_filter)
        count_query = query_expression.sql(dialect=self._engine)
        logger.info(f"Record Count Query for {self._layer}: {count_query}")
        return count_query