databricks-labs-lakebridge 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (171) hide show
  1. databricks/__init__.py +3 -0
  2. databricks/labs/__init__.py +3 -0
  3. databricks/labs/lakebridge/__about__.py +2 -0
  4. databricks/labs/lakebridge/__init__.py +11 -0
  5. databricks/labs/lakebridge/assessments/configure_assessment.py +194 -0
  6. databricks/labs/lakebridge/assessments/pipeline.py +188 -0
  7. databricks/labs/lakebridge/assessments/profiler_config.py +30 -0
  8. databricks/labs/lakebridge/base_install.py +12 -0
  9. databricks/labs/lakebridge/cli.py +449 -0
  10. databricks/labs/lakebridge/config.py +192 -0
  11. databricks/labs/lakebridge/connections/__init__.py +0 -0
  12. databricks/labs/lakebridge/connections/credential_manager.py +89 -0
  13. databricks/labs/lakebridge/connections/database_manager.py +98 -0
  14. databricks/labs/lakebridge/connections/env_getter.py +13 -0
  15. databricks/labs/lakebridge/contexts/__init__.py +0 -0
  16. databricks/labs/lakebridge/contexts/application.py +133 -0
  17. databricks/labs/lakebridge/coverage/__init__.py +0 -0
  18. databricks/labs/lakebridge/coverage/commons.py +223 -0
  19. databricks/labs/lakebridge/coverage/lakebridge_snow_transpilation_coverage.py +29 -0
  20. databricks/labs/lakebridge/coverage/local_report.py +9 -0
  21. databricks/labs/lakebridge/coverage/sqlglot_snow_transpilation_coverage.py +5 -0
  22. databricks/labs/lakebridge/coverage/sqlglot_tsql_transpilation_coverage.py +5 -0
  23. databricks/labs/lakebridge/deployment/__init__.py +0 -0
  24. databricks/labs/lakebridge/deployment/configurator.py +199 -0
  25. databricks/labs/lakebridge/deployment/dashboard.py +140 -0
  26. databricks/labs/lakebridge/deployment/installation.py +125 -0
  27. databricks/labs/lakebridge/deployment/job.py +147 -0
  28. databricks/labs/lakebridge/deployment/recon.py +145 -0
  29. databricks/labs/lakebridge/deployment/table.py +30 -0
  30. databricks/labs/lakebridge/deployment/upgrade_common.py +124 -0
  31. databricks/labs/lakebridge/discovery/table.py +36 -0
  32. databricks/labs/lakebridge/discovery/table_definition.py +23 -0
  33. databricks/labs/lakebridge/discovery/tsql_table_definition.py +185 -0
  34. databricks/labs/lakebridge/errors/exceptions.py +1 -0
  35. databricks/labs/lakebridge/helpers/__init__.py +0 -0
  36. databricks/labs/lakebridge/helpers/db_sql.py +24 -0
  37. databricks/labs/lakebridge/helpers/execution_time.py +20 -0
  38. databricks/labs/lakebridge/helpers/file_utils.py +64 -0
  39. databricks/labs/lakebridge/helpers/metastore.py +164 -0
  40. databricks/labs/lakebridge/helpers/recon_config_utils.py +176 -0
  41. databricks/labs/lakebridge/helpers/string_utils.py +62 -0
  42. databricks/labs/lakebridge/helpers/telemetry_utils.py +13 -0
  43. databricks/labs/lakebridge/helpers/validation.py +101 -0
  44. databricks/labs/lakebridge/install.py +849 -0
  45. databricks/labs/lakebridge/intermediate/__init__.py +0 -0
  46. databricks/labs/lakebridge/intermediate/dag.py +88 -0
  47. databricks/labs/lakebridge/intermediate/engine_adapter.py +0 -0
  48. databricks/labs/lakebridge/intermediate/root_tables.py +44 -0
  49. databricks/labs/lakebridge/jvmproxy.py +56 -0
  50. databricks/labs/lakebridge/lineage.py +42 -0
  51. databricks/labs/lakebridge/reconcile/__init__.py +0 -0
  52. databricks/labs/lakebridge/reconcile/compare.py +414 -0
  53. databricks/labs/lakebridge/reconcile/connectors/__init__.py +0 -0
  54. databricks/labs/lakebridge/reconcile/connectors/data_source.py +72 -0
  55. databricks/labs/lakebridge/reconcile/connectors/databricks.py +87 -0
  56. databricks/labs/lakebridge/reconcile/connectors/jdbc_reader.py +41 -0
  57. databricks/labs/lakebridge/reconcile/connectors/oracle.py +108 -0
  58. databricks/labs/lakebridge/reconcile/connectors/secrets.py +30 -0
  59. databricks/labs/lakebridge/reconcile/connectors/snowflake.py +173 -0
  60. databricks/labs/lakebridge/reconcile/connectors/source_adapter.py +30 -0
  61. databricks/labs/lakebridge/reconcile/connectors/sql_server.py +132 -0
  62. databricks/labs/lakebridge/reconcile/constants.py +37 -0
  63. databricks/labs/lakebridge/reconcile/exception.py +42 -0
  64. databricks/labs/lakebridge/reconcile/execute.py +920 -0
  65. databricks/labs/lakebridge/reconcile/query_builder/__init__.py +0 -0
  66. databricks/labs/lakebridge/reconcile/query_builder/aggregate_query.py +293 -0
  67. databricks/labs/lakebridge/reconcile/query_builder/base.py +138 -0
  68. databricks/labs/lakebridge/reconcile/query_builder/count_query.py +33 -0
  69. databricks/labs/lakebridge/reconcile/query_builder/expression_generator.py +292 -0
  70. databricks/labs/lakebridge/reconcile/query_builder/hash_query.py +91 -0
  71. databricks/labs/lakebridge/reconcile/query_builder/sampling_query.py +123 -0
  72. databricks/labs/lakebridge/reconcile/query_builder/threshold_query.py +231 -0
  73. databricks/labs/lakebridge/reconcile/recon_capture.py +635 -0
  74. databricks/labs/lakebridge/reconcile/recon_config.py +363 -0
  75. databricks/labs/lakebridge/reconcile/recon_output_config.py +85 -0
  76. databricks/labs/lakebridge/reconcile/runner.py +97 -0
  77. databricks/labs/lakebridge/reconcile/sampler.py +239 -0
  78. databricks/labs/lakebridge/reconcile/schema_compare.py +126 -0
  79. databricks/labs/lakebridge/resources/__init__.py +0 -0
  80. databricks/labs/lakebridge/resources/config/credentials.yml +33 -0
  81. databricks/labs/lakebridge/resources/reconcile/__init__.py +0 -0
  82. databricks/labs/lakebridge/resources/reconcile/dashboards/__init__.py +0 -0
  83. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/00_0_aggregate_recon_header.md +6 -0
  84. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_0_recon_id.filter.yml +6 -0
  85. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_1_executed_by.filter.yml +5 -0
  86. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_2_started_at.filter.yml +5 -0
  87. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_0_source_type.filter.yml +5 -0
  88. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_1_source_table.filter.yml +5 -0
  89. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_2_target_table.filter.yml +5 -0
  90. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/04_0_aggregate_summary_table.sql +46 -0
  91. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/05_0_aggregate_recon_drilldown_header.md +2 -0
  92. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_0_recon_id.filter.yml +5 -0
  93. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_1_category.filter.yml +5 -0
  94. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_2_aggregate_type.filter.yml +5 -0
  95. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/07_0_target_table.filter.yml +4 -0
  96. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/07_1_source_table.filter.yml +4 -0
  97. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/08_0_aggregate_details_table.sql +92 -0
  98. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/09_0_aggregate_missing_mismatch_header.md +1 -0
  99. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/10_0_aggr_mismatched_records.sql +19 -0
  100. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/11_0_aggr_missing_in_databricks.sql +19 -0
  101. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/11_1_aggr_missing_in_source.sql +19 -0
  102. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/dashboard.yml +365 -0
  103. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/00_0_recon_main.md +3 -0
  104. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/01_0_recon_id.filter.yml +6 -0
  105. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/01_1_report_type.filter.yml +5 -0
  106. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/01_2_executed_by.filter.yml +5 -0
  107. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/02_0_source_type.filter.yml +5 -0
  108. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/02_1_source_table.filter.yml +6 -0
  109. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/02_2_target_table.filter.yml +6 -0
  110. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/03_0_started_at.filter.yml +5 -0
  111. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/05_0_summary_table.sql +38 -0
  112. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/06_0_schema_comparison_header.md +3 -0
  113. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/07_0_schema_details_table.sql +42 -0
  114. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/08_0_drill_down_header.md +3 -0
  115. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/09_0_recon_id.filter.yml +4 -0
  116. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/09_1_category.filter.yml +4 -0
  117. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/10_0_target_table.filter.yml +4 -0
  118. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/10_1_source_table.filter.yml +4 -0
  119. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/11_0_recon_details_pivot.sql +40 -0
  120. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/12_0_daily_data_validation_issue_header.md +3 -0
  121. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/13_0_success_fail_.filter.yml +4 -0
  122. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/14_0_failed_recon_ids.sql +15 -0
  123. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/15_0_total_failed_runs.sql +10 -0
  124. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/15_1_failed_targets.sql +10 -0
  125. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/15_2_successful_targets.sql +10 -0
  126. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/16_0_missing_mismatch_header.md +1 -0
  127. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/17_0_mismatched_records.sql +14 -0
  128. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/17_1_threshold_mismatches.sql +14 -0
  129. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/18_0_missing_in_databricks.sql +14 -0
  130. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/18_1_missing_in_source.sql +14 -0
  131. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/dashboard.yml +545 -0
  132. databricks/labs/lakebridge/resources/reconcile/queries/__init__.py +0 -0
  133. databricks/labs/lakebridge/resources/reconcile/queries/installation/__init__.py +0 -0
  134. databricks/labs/lakebridge/resources/reconcile/queries/installation/aggregate_details.sql +7 -0
  135. databricks/labs/lakebridge/resources/reconcile/queries/installation/aggregate_metrics.sql +15 -0
  136. databricks/labs/lakebridge/resources/reconcile/queries/installation/aggregate_rules.sql +6 -0
  137. databricks/labs/lakebridge/resources/reconcile/queries/installation/details.sql +7 -0
  138. databricks/labs/lakebridge/resources/reconcile/queries/installation/main.sql +24 -0
  139. databricks/labs/lakebridge/resources/reconcile/queries/installation/metrics.sql +21 -0
  140. databricks/labs/lakebridge/transpiler/__init__.py +0 -0
  141. databricks/labs/lakebridge/transpiler/execute.py +423 -0
  142. databricks/labs/lakebridge/transpiler/lsp/__init__.py +0 -0
  143. databricks/labs/lakebridge/transpiler/lsp/lsp_engine.py +564 -0
  144. databricks/labs/lakebridge/transpiler/sqlglot/__init__.py +0 -0
  145. databricks/labs/lakebridge/transpiler/sqlglot/dialect_utils.py +30 -0
  146. databricks/labs/lakebridge/transpiler/sqlglot/generator/__init__.py +0 -0
  147. databricks/labs/lakebridge/transpiler/sqlglot/generator/databricks.py +771 -0
  148. databricks/labs/lakebridge/transpiler/sqlglot/lca_utils.py +138 -0
  149. databricks/labs/lakebridge/transpiler/sqlglot/local_expression.py +197 -0
  150. databricks/labs/lakebridge/transpiler/sqlglot/parsers/__init__.py +0 -0
  151. databricks/labs/lakebridge/transpiler/sqlglot/parsers/oracle.py +23 -0
  152. databricks/labs/lakebridge/transpiler/sqlglot/parsers/presto.py +202 -0
  153. databricks/labs/lakebridge/transpiler/sqlglot/parsers/snowflake.py +535 -0
  154. databricks/labs/lakebridge/transpiler/sqlglot/sqlglot_engine.py +203 -0
  155. databricks/labs/lakebridge/transpiler/transpile_engine.py +49 -0
  156. databricks/labs/lakebridge/transpiler/transpile_status.py +68 -0
  157. databricks/labs/lakebridge/uninstall.py +28 -0
  158. databricks/labs/lakebridge/upgrades/v0.4.0_add_main_table_operation_name_column.py +80 -0
  159. databricks/labs/lakebridge/upgrades/v0.6.0_alter_metrics_datatype.py +51 -0
  160. databricks_labs_lakebridge-0.10.0.dist-info/METADATA +58 -0
  161. databricks_labs_lakebridge-0.10.0.dist-info/RECORD +171 -0
  162. databricks_labs_lakebridge-0.10.0.dist-info/WHEEL +4 -0
  163. databricks_labs_lakebridge-0.10.0.dist-info/entry_points.txt +2 -0
  164. databricks_labs_lakebridge-0.10.0.dist-info/licenses/LICENSE +69 -0
  165. databricks_labs_lakebridge-0.10.0.dist-info/licenses/NOTICE +42 -0
  166. docs/lakebridge/src/components/Button.tsx +81 -0
  167. docs/lakebridge/src/css/custom.css +167 -0
  168. docs/lakebridge/src/css/table.css +20 -0
  169. docs/lakebridge/src/pages/index.tsx +57 -0
  170. docs/lakebridge/src/theme/Footer/index.tsx +24 -0
  171. docs/lakebridge/src/theme/Layout/index.tsx +18 -0
@@ -0,0 +1,535 @@
1
+ import logging
2
+ import re
3
+
4
+ from sqlglot import expressions as exp
5
+ from sqlglot.dialects.dialect import build_date_delta as parse_date_delta, build_formatted_time
6
+ from sqlglot.dialects.snowflake import Snowflake as SqlglotSnowflake
7
+ from sqlglot.errors import TokenError, ParseError
8
+ from sqlglot.helper import is_int, seq_get
9
+ from sqlglot.optimizer.simplify import simplify_literals
10
+ from sqlglot.parser import build_var_map as parse_var_map
11
+ from sqlglot.tokens import Token, TokenType
12
+ from sqlglot.trie import new_trie
13
+
14
+ from databricks.labs.lakebridge.transpiler.sqlglot import local_expression
15
+
16
+ logger = logging.getLogger(__name__)
17
+ # pylint: disable=protected-access
18
""" SF Supported Date and Time Parts:
    https://docs.snowflake.com/en/sql-reference/functions-date-time#label-supported-date-time-parts
    Covers DATEADD, DATEDIFF, DATE_TRUNC, LAST_DAY
"""
# Maps every Snowflake date/time-part alias to its canonical unit name,
# used by the DATEADD/DATEDIFF/LAST_DAY parsers below.
DATE_DELTA_INTERVAL = {
    "years": "year",
    "year": "year",
    "yrs": "year",
    "yr": "year",
    "yyyy": "year",
    "yyy": "year",
    "yy": "year",
    "y": "year",
    "quarters": "quarter",
    "quarter": "quarter",
    "qtrs": "quarter",
    "qtr": "quarter",
    "q": "quarter",
    "months": "month",
    "month": "month",
    "mons": "month",
    "mon": "month",
    "mm": "month",
    "weekofyear": "week",
    "week": "week",
    "woy": "week",
    "wy": "week",
    "wk": "week",
    "w": "week",
    "dayofmonth": "day",
    "days": "day",
    "day": "day",
    "dd": "day",
    "d": "day",
}

# Window/ranking expression types grouped for special handling elsewhere in
# the transpiler (presumably generators check membership — confirm at callers).
rank_functions = (
    local_expression.CumeDist,
    exp.FirstValue,
    exp.LastValue,
    local_expression.NthValue,
    local_expression.Ntile,
)
61
+
62
+
63
def _parse_to_timestamp(args: list) -> exp.StrToTime | exp.UnixToTime | exp.TimeStrToTime:
    """Map Snowflake TO_TIMESTAMP-style arguments onto the matching sqlglot expression.

    Two-argument form: a string second argument is a format string, a numeric
    one is an epoch scale. One-argument form: literals are classified as epoch
    numbers or formatted date strings; anything else becomes TimeStrToTime.
    """
    if len(args) == 2:
        value, second = args
        if second.is_string:
            # case: <string_expr> [ , <format> ]
            return build_formatted_time(exp.StrToTime, "snowflake", default=True)(args)
        # case: <numeric_expr> [ , <scale> ]
        return exp.UnixToTime(this=value, scale=second)

    # The first argument might be an expression like 40 * 365 * 86400, so try
    # to reduce it with `simplify_literals` before checking for a Literal.
    head = seq_get(args, 0)
    if not isinstance(simplify_literals(head, root=True), exp.Literal):
        # case: <variant_expr> or other expressions such as columns
        return exp.TimeStrToTime.from_arg_list(args)

    if head.is_string:
        if is_int(head.this):
            # case: <integer> (epoch seconds hidden inside a string literal)
            return exp.UnixToTime.from_arg_list(args)
        # case: <date_expr>
        return build_formatted_time(exp.StrToTime, "snowflake", default=True)(args)

    # case: <numeric_expr>
    return exp.UnixToTime.from_arg_list(args)
88
+
89
+
90
def _parse_date_add(args: list) -> exp.DateAdd:
    """Build DateAdd from Snowflake's (unit, amount, date) argument order."""
    unit, amount, value = seq_get(args, 0), seq_get(args, 1), seq_get(args, 2)
    return exp.DateAdd(this=value, expression=amount, unit=unit)
92
+
93
+
94
def _parse_split_part(args: list) -> local_expression.SplitPart:
    """Build SPLIT_PART, normalising a zero part number to 1.

    Raises:
        ParseError: when the call does not have exactly three arguments.
    """
    if len(args) != 3:
        err_msg = f"Error Parsing args `{args}`. Number of args must be 3, given {len(args)}"
        raise ParseError(err_msg)

    part_number = seq_get(args, 2)
    if isinstance(part_number, exp.Literal):
        # In Snowflake if the partNumber is 0, it is treated as 1.
        # Please refer to https://docs.snowflake.com/en/sql-reference/functions/split_part
        if part_number.is_int and int(part_number.name) == 0:
            part_number = exp.Literal.number(1)
    else:
        # Non-literal part numbers must be normalised at runtime: IF(part = 0, 1, part).
        is_zero = exp.EQ(this=part_number, expression=exp.Literal.number(0))
        part_number = exp.If(this=is_zero, true=exp.Literal.number(1), false=part_number)

    return local_expression.SplitPart(this=seq_get(args, 0), expression=seq_get(args, 1), partNum=part_number)
111
+
112
+
113
def _div0_to_if(args: list) -> exp.If:
    """Rewrite DIV0(a, b) as IF(b = 0, 0, a / b)."""
    numerator, denominator = seq_get(args, 0), seq_get(args, 1)
    zero_denominator = exp.EQ(this=denominator, expression=exp.Literal.number(0))
    quotient = exp.Div(this=numerator, expression=denominator)
    return exp.If(this=zero_denominator, true=exp.Literal.number(0), false=quotient)
118
+
119
+
120
def _div0null_to_if(args: list) -> exp.If:
    """Rewrite DIV0NULL(a, b) as IF(b = 0 OR b IS NULL, 0, a / b)."""
    numerator, denominator = seq_get(args, 0), seq_get(args, 1)
    guard = exp.Or(
        this=exp.EQ(this=denominator, expression=exp.Literal.number(0)),
        expression=exp.Is(this=denominator, expression=exp.Null()),
    )
    quotient = exp.Div(this=numerator, expression=denominator)
    return exp.If(this=guard, true=exp.Literal.number(0), false=quotient)
128
+
129
+
130
def _parse_json_extract_path_text(args: list) -> local_expression.JsonExtractPathText:
    """Build JSON_EXTRACT_PATH_TEXT after validating the two-argument arity.

    Raises:
        ParseError: when the call does not have exactly two arguments.
    """
    if len(args) != 2:
        err_message = f"Error Parsing args `{args}`. Number of args must be 2, given {len(args)}"
        raise ParseError(err_message)
    return local_expression.JsonExtractPathText(this=seq_get(args, 0), path_name=seq_get(args, 1))
135
+
136
+
137
def _parse_array_contains(args: list) -> exp.ArrayContains:
    """Build ArrayContains, swapping Snowflake's (element, array) order to (array, element).

    Raises:
        ParseError: when the call does not have exactly two arguments.
    """
    if len(args) != 2:
        err_message = f"Error Parsing args `{args}`. Number of args must be 2, given {len(args)}"
        raise ParseError(err_message)
    return exp.ArrayContains(this=seq_get(args, 1), expression=seq_get(args, 0))
142
+
143
+
144
def _parse_dayname(args: list) -> local_expression.DateFormat:
    """Translate DAYNAME(date) into DATE_FORMAT(date, 'E').

    * E, EE, EEE, returns short day name (Mon)
    * EEEE, returns full day name (Monday)
    :param args: node expression
    :return: DateFormat with `E` format

    Raises:
        ParseError: when the call does not have exactly one argument.
    """
    if len(args) != 1:
        err_message = f"Error Parsing args `{args}`. Number of args must be 1, given {len(args)}"
        raise ParseError(err_message)
    return local_expression.DateFormat(this=seq_get(args, 0), expression=exp.Literal.string("E"))
155
+
156
+
157
def _parse_trytonumber(args: list) -> local_expression.TryToNumber:
    """Build TRY_TO_NUMBER/TRY_TO_DECIMAL/TRY_TO_NUMERIC with optional format, precision and scale.

    Accepted arities: 1 (value only, defaults warned), 2 (value, format),
    4 (value, format, precision, scale). Three arguments is rejected because
    the format cannot be distinguished from precision/scale.
    """
    arity = len(args)
    if arity == 1:
        # Only the value was supplied: warn that Snowflake defaults will apply.
        logger.warning(
            f"""*Warning:: Parsing args `{args}`:
             * `format` is missing
             * assuming defaults `precision`[38] and `scale`[0]
             """
        )
    elif arity == 3:
        raise ParseError(
            f"""Error Parsing args `{args}`:
             * `format` is required
             * `precision` and `scale` both are required [if specified]
             """
        )

    if arity == 4:
        return local_expression.TryToNumber(
            this=seq_get(args, 0), expression=seq_get(args, 1), precision=seq_get(args, 2), scale=seq_get(args, 3)
        )

    return local_expression.TryToNumber(this=seq_get(args, 0), expression=seq_get(args, 1))
177
+
178
+
179
def _parse_monthname(args: list) -> local_expression.DateFormat:
    """Translate MONTHNAME(date) into DATE_FORMAT(date, 'MMM').

    Raises:
        ParseError: when the call does not have exactly one argument.
    """
    if len(args) != 1:
        err_message = f"Error Parsing args `{args}`. Number of args must be 1, given {len(args)}"
        raise ParseError(err_message)
    return local_expression.DateFormat(this=seq_get(args, 0), expression=exp.Literal.string("MMM"))
184
+
185
+
186
def _parse_object_construct(args: list) -> exp.StarMap | exp.Struct:
    """Convert OBJECT_CONSTRUCT arguments into a Struct.

    A `*` argument (StarMap) collapses to a single-entry Struct; otherwise the
    alternating key/value list becomes PropertyEQ entries.
    """
    var_map = parse_var_map(args)

    if isinstance(var_map, exp.StarMap):
        return exp.Struct(expressions=[var_map.this])

    pairs = zip(var_map.keys, var_map.values, strict=False)
    return exp.Struct(expressions=[exp.PropertyEQ(this=key.this, expression=value) for key, value in pairs])
197
+
198
+
199
def _parse_to_boolean(args: list, *, error=False) -> local_expression.ToBoolean:
    """Build ToBoolean; `error` encodes TO_BOOLEAN (raises on bad input) vs TRY_TO_BOOLEAN."""
    raise_flag = exp.Literal.number(1) if error else exp.Literal.number(0)
    return local_expression.ToBoolean(this=seq_get(args, 0), raise_error=raise_flag)
202
+
203
+
204
+ def _parse_tonumber(args: list) -> local_expression.ToNumber:
205
+ if len(args) > 4:
206
+ error_msg = f"""Error Parsing args args:
207
+ * Number of args cannot be more than `4`, given `{len(args)}`
208
+ """
209
+ raise ParseError(error_msg)
210
+
211
+ match len(args):
212
+ case 1:
213
+ msg = (
214
+ "Precision and Scale are not specified, assuming defaults `precision`[38] and `scale`[0]. "
215
+ "If Format is not specified, it will be inferred as simple cast as decimal"
216
+ )
217
+ logger.warning(msg)
218
+ return local_expression.ToNumber(this=seq_get(args, 0))
219
+ case 3:
220
+ msg = "If Format is not specified, it will be inferred as simple cast as decimal"
221
+ logger.warning(msg)
222
+ return local_expression.ToNumber(this=seq_get(args, 0), precision=seq_get(args, 1), scale=seq_get(args, 2))
223
+ case 4:
224
+ return local_expression.ToNumber(
225
+ this=seq_get(args, 0), expression=seq_get(args, 1), precision=seq_get(args, 2), scale=seq_get(args, 3)
226
+ )
227
+
228
+ return local_expression.ToNumber(this=seq_get(args, 0), expression=seq_get(args, 1))
229
+
230
+
231
def contains_expression(expr, target_type):
    """Return True if `expr` or any node reachable via `this`/`expressions` is a `target_type`."""
    if isinstance(expr, target_type):
        return True
    # Descend into the single `this` child when the node has one.
    if hasattr(expr, 'this') and contains_expression(expr.this, target_type):
        return True
    # Descend into each `expressions` child when the node has that attribute.
    return any(contains_expression(child, target_type) for child in getattr(expr, 'expressions', ()))
241
+
242
+
243
def _parse_sha2(args: list) -> exp.SHA2:
    """Build SHA2; a single argument implies Snowflake's default 256-bit digest."""
    digest_bits = exp.Literal.number(256) if len(args) == 1 else seq_get(args, 1)
    return exp.SHA2(this=seq_get(args, 0), length=digest_bits)
247
+
248
+
249
def _parse_last_day(args: list) -> exp.LastDay | exp.DateSub:
    """Translate LAST_DAY; an explicit date part becomes trunc + 1 part - 1 day.

    LAST_DAY(d) maps straight through; LAST_DAY(d, part) is rewritten as
    DATE_TRUNC(part, d) + INTERVAL 1 part - INTERVAL 1 day.

    Raises:
        ParseError: when the date part is not year/quarter/month/week.
    """
    if len(args) == 1:
        return exp.LastDay.from_arg_list(args)

    raw_part = seq_get(args, 1)
    if isinstance(raw_part, exp.Literal):
        date_part = DATE_DELTA_INTERVAL.get(raw_part.this.lower(), None)
    elif isinstance(raw_part, exp.Column):
        date_part = DATE_DELTA_INTERVAL.get(raw_part.name.lower(), None)
    else:
        date_part = raw_part

    if date_part is None or date_part.lower() not in ('year', 'quarter', 'month', 'week'):
        raise ParseError(f'Invalid date part {date_part} for last_day')

    truncated = local_expression.DateTrunc(this=seq_get(args, 0), unit=exp.Literal.string(date_part))

    # Step forward one whole date part from the truncated start...
    plus_one_part = parse_date_delta(exp.DateAdd, unit_mapping=DATE_DELTA_INTERVAL)(
        [exp.Literal.string(date_part), exp.Literal.number(1), truncated]
    )

    # ...then step back a single day to land on the period's last day.
    return parse_date_delta(exp.DateSub, unit_mapping=DATE_DELTA_INTERVAL)(
        [exp.Literal.string('DAY'), exp.Literal.number(1), plus_one_part]
    )
274
+
275
+
276
+ class Snowflake(SqlglotSnowflake):
277
+ # Instantiate Snowflake Dialect
278
+ snowflake = SqlglotSnowflake()
279
+
280
    class Tokenizer(SqlglotSnowflake.Tokenizer):
        """Snowflake tokenizer extended with dynamic custom-token detection.

        Before tokenizing, the raw SQL is scanned for patterns in
        CUSTOM_TOKEN_MAP (e.g. ``CREATE OR REPLACE PROCEDURE``, JavaScript-style
        ``var x = y``); each match is registered as a keyword and merged into
        the keyword trie so `_scan` recognises it.
        """

        # Snowflake additionally supports `//` line comments.
        COMMENTS = ["--", "//", ("/*", "*/")]
        STRING_ESCAPES = ["\\", "'"]

        # Regex pattern -> token type. Matches found in the input SQL are
        # promoted to keywords for the current tokenize() call.
        CUSTOM_TOKEN_MAP = {
            r"(?i)CREATE\s+OR\s+REPLACE\s+PROCEDURE": TokenType.PROCEDURE,
            r"(?i)var\s+\w+\s+=\s+\w+?": TokenType.VAR,
        }

        SINGLE_TOKENS = {
            **SqlglotSnowflake.Tokenizer.SINGLE_TOKENS,
            "&": TokenType.PARAMETER,  # https://docs.snowflake.com/en/user-guide/snowsql-use#substituting-variables-in-a-session
            "!": TokenType.COMMAND,
        }

        KEYWORDS = {**SqlglotSnowflake.Tokenizer.KEYWORDS}
        # DEC is not a reserved keyword in Snowflake it can be used as table alias
        KEYWORDS.pop("DEC")

        @classmethod
        def update_keywords(cls, new_key_word_dict):
            """Merge newly discovered keywords in; existing KEYWORDS take precedence."""
            cls.KEYWORDS = new_key_word_dict | cls.KEYWORDS

        @classmethod
        def merge_trie(cls, parent_trie, curr_trie):
            """Recursively merge two keyword tries.

            Keys present in both tries have their dict subtries merged; on a
            dict-vs-leaf conflict the parent's subtrie wins.
            """
            merged_trie = {}
            logger.debug(f"The Parent Trie is {parent_trie}")
            logger.debug(f"The Input Trie is {curr_trie}")
            for key in set(parent_trie.keys()) | set(curr_trie.keys()):  # Get all unique keys from both tries
                if key in parent_trie and key in curr_trie:  # If the key is in both tries, merge the subtries
                    if isinstance(parent_trie[key], dict) and isinstance(curr_trie[key], dict):
                        logger.debug(f"New trie inside the key is {curr_trie}")
                        logger.debug(f"Parent trie inside the key is {parent_trie}")
                        merged_trie[key] = cls.merge_trie(parent_trie[key], curr_trie[key])
                        logger.debug(f"Merged Trie is {merged_trie}")
                    elif isinstance(parent_trie[key], dict):
                        merged_trie[key] = parent_trie[key]
                    else:
                        merged_trie[key] = curr_trie[key]
                elif key in parent_trie:  # If the key is only in trie1, add it to the merged trie
                    merged_trie[key] = parent_trie[key]
                else:  # If the key is only in trie2, add it to the merged trie
                    merged_trie[key] = curr_trie[key]
            return merged_trie

        @classmethod
        def update_keyword_trie(
            cls,
            curr_trie,
            parent_trie=None,
        ):
            """Merge `curr_trie` into the class keyword trie (defaults to sqlglot's own)."""
            if parent_trie is None:
                # Fall back to the trie sqlglot precomputed from KEYWORDS.
                parent_trie = cls._KEYWORD_TRIE
            cls.KEYWORD_TRIE = cls.merge_trie(parent_trie, curr_trie)

        def match_strings_token_dict(self, string, pattern_dict):
            """Return {matched_text_upper: token_type} for every pattern hit in `string`."""
            result_dict = {}
            for pattern in pattern_dict:
                matches = re.finditer(pattern, string, re.MULTILINE | re.IGNORECASE | re.DOTALL)
                for _, match in enumerate(matches, start=1):
                    result_dict[match.group().upper()] = pattern_dict[pattern]
            return result_dict

        def match_strings_list(self, string, pattern_dict):
            """Return the upper-cased text of every pattern hit in `string`."""
            result = []
            for pattern in pattern_dict:
                matches = re.finditer(pattern, string, re.MULTILINE | re.IGNORECASE | re.DOTALL)
                for _, match in enumerate(matches, start=1):
                    result.append(match.group().upper())
            return result

        def tokenize(self, sql: str) -> list[Token]:
            """Returns a list of tokens corresponding to the SQL string `sql`."""
            self.reset()
            self.sql = sql
            # Update Keywords
            ref_dict = self.match_strings_token_dict(sql, self.CUSTOM_TOKEN_MAP)
            self.update_keywords(ref_dict)
            # Update Keyword Trie
            custom_trie = new_trie(self.match_strings_list(sql, self.CUSTOM_TOKEN_MAP))
            logger.debug(
                f"The New Trie after adding the REF, VAR and IF ELSE blocks "
                f"based on {self.CUSTOM_TOKEN_MAP}, is \n\n {custom_trie}"
            )
            self.update_keyword_trie(custom_trie)
            logger.debug(f"Updated New Trie is {custom_trie}")
            # Parent Code
            self.size = len(sql)
            try:
                self._scan()
            except Exception as e:
                # Surface ~100 characters of SQL around the failure point in the error.
                start = self._current - 50
                end = self._current + 50
                start = start if start > 0 else 0
                end = end if end < self.size else self.size - 1
                context = self.sql[start:end]
                msg = f"Error tokenizing '{context}'"
                raise TokenError(msg) from e
            return self.tokens
380
+
381
+ class Parser(SqlglotSnowflake.Parser):
382
+ FUNCTIONS = {
383
+ **SqlglotSnowflake.Parser.FUNCTIONS,
384
+ "ARRAY_AGG": exp.ArrayAgg.from_arg_list,
385
+ "STRTOK_TO_ARRAY": local_expression.Split.from_arg_list,
386
+ "DATE_FROM_PARTS": local_expression.MakeDate.from_arg_list,
387
+ "CONVERT_TIMEZONE": local_expression.ConvertTimeZone.from_arg_list,
388
+ "TRY_TO_DATE": local_expression.TryToDate.from_arg_list,
389
+ "TRY_TO_TIMESTAMP": local_expression.TryToTimestamp.from_arg_list,
390
+ "STRTOK": local_expression.StrTok.from_arg_list,
391
+ "SPLIT_PART": _parse_split_part,
392
+ "TIMESTAMPADD": _parse_date_add,
393
+ "TRY_TO_DECIMAL": _parse_trytonumber,
394
+ "TRY_TO_NUMBER": _parse_trytonumber,
395
+ "TRY_TO_NUMERIC": _parse_trytonumber,
396
+ "DATEADD": parse_date_delta(exp.DateAdd, unit_mapping=DATE_DELTA_INTERVAL),
397
+ "DATEDIFF": parse_date_delta(exp.DateDiff, unit_mapping=DATE_DELTA_INTERVAL),
398
+ "IS_INTEGER": local_expression.IsInteger.from_arg_list,
399
+ "DIV0": _div0_to_if,
400
+ "DIV0NULL": _div0null_to_if,
401
+ "JSON_EXTRACT_PATH_TEXT": _parse_json_extract_path_text,
402
+ "BITOR_AGG": local_expression.BitOr.from_arg_list,
403
+ "ARRAY_CONTAINS": _parse_array_contains,
404
+ "DAYNAME": _parse_dayname,
405
+ "BASE64_ENCODE": exp.ToBase64.from_arg_list,
406
+ "BASE64_DECODE_STRING": exp.FromBase64.from_arg_list,
407
+ "TRY_BASE64_DECODE_STRING": exp.FromBase64.from_arg_list,
408
+ "ARRAY_CONSTRUCT_COMPACT": local_expression.ArrayConstructCompact.from_arg_list,
409
+ "ARRAY_INTERSECTION": local_expression.ArrayIntersection.from_arg_list,
410
+ "ARRAY_SLICE": local_expression.ArraySlice.from_arg_list,
411
+ "MONTHNAME": _parse_monthname,
412
+ "MONTH_NAME": _parse_monthname,
413
+ "OBJECT_CONSTRUCT": _parse_object_construct,
414
+ "OBJECT_KEYS": local_expression.ObjectKeys.from_arg_list,
415
+ "TRY_PARSE_JSON": exp.ParseJSON.from_arg_list,
416
+ "TIMEDIFF": parse_date_delta(exp.DateDiff, unit_mapping=DATE_DELTA_INTERVAL),
417
+ "TIMESTAMPDIFF": parse_date_delta(exp.DateDiff, unit_mapping=DATE_DELTA_INTERVAL),
418
+ "TIMEADD": _parse_date_add,
419
+ "TO_BOOLEAN": lambda args: _parse_to_boolean(args, error=True),
420
+ "TO_DECIMAL": _parse_tonumber,
421
+ "TO_DOUBLE": local_expression.ToDouble.from_arg_list,
422
+ "TO_NUMBER": _parse_tonumber,
423
+ "TO_NUMERIC": _parse_tonumber,
424
+ "TO_OBJECT": local_expression.ToObject.from_arg_list,
425
+ "TO_TIME": _parse_to_timestamp,
426
+ "TIMESTAMP_FROM_PARTS": local_expression.TimestampFromParts.from_arg_list,
427
+ "TO_VARIANT": local_expression.ToVariant.from_arg_list,
428
+ "TRY_TO_BOOLEAN": lambda args: _parse_to_boolean(args, error=False),
429
+ "UUID_STRING": local_expression.UUID.from_arg_list,
430
+ "SYSDATE": exp.CurrentTimestamp.from_arg_list,
431
+ "TRUNC": lambda args: local_expression.DateTrunc(unit=seq_get(args, 1), this=seq_get(args, 0)),
432
+ "APPROX_PERCENTILE": exp.ApproxQuantile.from_arg_list,
433
+ "NTH_VALUE": local_expression.NthValue.from_arg_list,
434
+ "MEDIAN": local_expression.Median.from_arg_list,
435
+ "CUME_DIST": local_expression.CumeDist.from_arg_list,
436
+ "DENSE_RANK": local_expression.DenseRank.from_arg_list,
437
+ "RANK": local_expression.Rank.from_arg_list,
438
+ "PERCENT_RANK": local_expression.PercentRank.from_arg_list,
439
+ "NTILE": local_expression.Ntile.from_arg_list,
440
+ "TO_ARRAY": local_expression.ToArray.from_arg_list,
441
+ "SHA2": _parse_sha2,
442
+ "LAST_DAY": _parse_last_day,
443
+ "ARRAY_FLATTEN": exp.Flatten.from_arg_list,
444
+ }
445
+
446
# Parser-level overrides layered on top of sqlglot's stock Snowflake parser.

# LISTAGG needs custom handling for the DISTINCT qualifier and the
# optional separator argument.
FUNCTION_PARSERS = SqlglotSnowflake.Parser.FUNCTION_PARSERS | {
    "LISTAGG": lambda self: self._parse_list_agg(),
}

# Route session-parameter tokens through the custom _parse_parameter.
PLACEHOLDER_PARSERS = SqlglotSnowflake.Parser.PLACEHOLDER_PARSERS | {
    TokenType.PARAMETER: lambda self: self._parse_parameter(),
}

# Additionally allow COLLATE to appear where a function name is expected.
FUNC_TOKENS = SqlglotSnowflake.Parser.FUNC_TOKENS | {TokenType.COLLATE}

# Plain copies of the base tables; kept as explicit overrides so this
# dialect owns its own (mutable) instances.
COLUMN_OPERATORS = dict(SqlglotSnowflake.Parser.COLUMN_OPERATORS)

# TIME is not treated as a timestamp token in this dialect.
TIMESTAMPS: set[TokenType] = SqlglotSnowflake.Parser.TIMESTAMPS - {TokenType.TIME}

RANGE_PARSERS = dict(SqlglotSnowflake.Parser.RANGE_PARSERS)

ALTER_PARSERS = dict(SqlglotSnowflake.Parser.ALTER_PARSERS)
469
+
470
def _parse_list_agg(self) -> exp.GroupConcat:
    """Parse LISTAGG(...) arguments into a GroupConcat expression.

    Supports an optional leading DISTINCT qualifier and an optional
    second argument, which becomes the separator.
    """
    if self._match(TokenType.DISTINCT):
        # The first argument is wrapped in a Distinct node; any further
        # comma-separated arguments follow it unchanged.
        parsed: list[exp.Expression] = [
            self.expression(exp.Distinct, expressions=[self._parse_conjunction()])
        ]
        if self._match(TokenType.COMMA):
            parsed += self._parse_csv(self._parse_conjunction)
    else:
        parsed = self._parse_csv(self._parse_conjunction)

    # seq_get returns None when no separator argument was given.
    return self.expression(exp.GroupConcat, this=parsed[0], separator=seq_get(parsed, 1))
479
+
480
def _parse_types(
    self, check_func: bool = False, schema: bool = False, allow_identifiers: bool = True
) -> exp.Expression | None:
    """Parse a type, normalizing bare Snowflake numeric aliases.

    Snowflake numeric aliases declared without an explicit precision and
    scale all resolve to NUMBER(38, 0); see
    https://docs.snowflake.com/en/sql-reference/data-types-numeric
    """
    parsed = super()._parse_types(check_func=check_func, schema=schema, allow_identifiers=allow_identifiers)

    is_bare_numeric = (
        isinstance(parsed, exp.DataType)
        and not parsed.expressions  # no explicit (precision, scale)
        and parsed.is_type("numeric", "decimal", "number", "integer", "int", "smallint", "bigint")
    )
    return exp.DataType.build("DECIMAL(38,0)") if is_bare_numeric else parsed
492
+
493
def _parse_parameter(self):
    """Parse a session parameter reference, optionally brace-wrapped.

    NOTE(review): the matcher calls below consume tokens as side effects,
    so their exact order is load-bearing and preserved verbatim.
    """
    brace_wrapped = self._match(TokenType.L_BRACE)
    name = self._parse_var() or self._parse_identifier() or self._parse_primary()
    self._match(TokenType.R_BRACE)

    # A trailing var/identifier/primary is captured as a suffix unless a
    # space (and no dot) separates it from the parameter name.
    trailing: exp.Expression | None = None
    if not self._match(TokenType.SPACE) or self._match(TokenType.DOT):
        trailing = self._parse_var() or self._parse_identifier() or self._parse_primary()

    return self.expression(local_expression.Parameter, this=name, wrapped=brace_wrapped, suffix=trailing)
502
+
503
def _parse_window(self, this: exp.Expression | None, alias: bool = False) -> exp.Expression | None:
    """Parse a window clause, injecting Snowflake's default frame.

    Rank-related functions in Snowflake implicitly use ROWS BETWEEN
    UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING when no frame is given.
    """
    window = super()._parse_window(this=this, alias=alias)
    if not window:
        return window

    frame_missing = window.args.get('spec') is None
    if frame_missing and contains_expression(window.this, rank_functions):
        window.args['spec'] = self.expression(
            exp.WindowSpec,
            kind="ROWS",
            start="UNBOUNDED",
            start_side="PRECEDING",
            end="UNBOUNDED",
            end_side="FOLLOWING",
        )
    return window
516
+
517
def _parse_alter_table_add(self) -> list[exp.Expression]:
    """Parse the ADD portion of an ALTER TABLE statement.

    Tries, in order: ADD CONSTRAINT, bare ADD with wrapped column defs,
    ADD COLUMN (with or without parentheses), and finally the generic
    wrapped add-column form. The attempt order is preserved exactly.
    """
    checkpoint = self._index - 1

    # ALTER TABLE ... ADD CONSTRAINT ...
    if self._match_set(self.ADD_CONSTRAINT_TOKENS, advance=False):
        return self._parse_csv(
            lambda: self.expression(exp.AddConstraint, expressions=self._parse_csv(self._parse_constraint))
        )

    self._retreat(checkpoint)
    if not self.ALTER_TABLE_ADD_REQUIRED_FOR_EACH_COLUMN and self._match_text_seq("ADD"):
        return self._parse_wrapped_csv(self._parse_field_def, optional=True)

    if self._match_text_seq("ADD", "COLUMN"):
        added = self._parse_schema()
        if added:
            return [added]
        # Columns may also appear without enclosing parentheses.
        return self._parse_csv(self._parse_field_def)

    return self._parse_wrapped_csv(self._parse_add_column, optional=True)