altimate-code 0.5.2 → 0.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101)
  1. package/CHANGELOG.md +12 -0
  2. package/bin/altimate +6 -0
  3. package/bin/altimate-code +6 -0
  4. package/dbt-tools/bin/altimate-dbt +2 -0
  5. package/dbt-tools/dist/altimate_python_packages/altimate_packages/altimate/__init__.py +0 -0
  6. package/dbt-tools/dist/altimate_python_packages/altimate_packages/altimate/fetch_schema.py +35 -0
  7. package/dbt-tools/dist/altimate_python_packages/altimate_packages/altimate/utils.py +353 -0
  8. package/dbt-tools/dist/altimate_python_packages/altimate_packages/altimate/validate_sql.py +114 -0
  9. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/__init__.py +178 -0
  10. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/__main__.py +96 -0
  11. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/_typing.py +17 -0
  12. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dataframe/__init__.py +3 -0
  13. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dataframe/sql/__init__.py +18 -0
  14. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dataframe/sql/_typing.py +18 -0
  15. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dataframe/sql/column.py +332 -0
  16. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dataframe/sql/dataframe.py +866 -0
  17. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dataframe/sql/functions.py +1267 -0
  18. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dataframe/sql/group.py +59 -0
  19. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dataframe/sql/normalize.py +78 -0
  20. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dataframe/sql/operations.py +53 -0
  21. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dataframe/sql/readwriter.py +108 -0
  22. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dataframe/sql/session.py +190 -0
  23. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dataframe/sql/transforms.py +9 -0
  24. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dataframe/sql/types.py +212 -0
  25. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dataframe/sql/util.py +32 -0
  26. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dataframe/sql/window.py +134 -0
  27. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/__init__.py +118 -0
  28. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/athena.py +166 -0
  29. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/bigquery.py +1331 -0
  30. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/clickhouse.py +1393 -0
  31. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/databricks.py +131 -0
  32. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/dialect.py +1915 -0
  33. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/doris.py +561 -0
  34. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/drill.py +157 -0
  35. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/druid.py +20 -0
  36. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/duckdb.py +1159 -0
  37. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/dune.py +16 -0
  38. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/hive.py +787 -0
  39. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/materialize.py +94 -0
  40. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/mysql.py +1324 -0
  41. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/oracle.py +378 -0
  42. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/postgres.py +778 -0
  43. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/presto.py +788 -0
  44. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/prql.py +203 -0
  45. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/redshift.py +448 -0
  46. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/risingwave.py +78 -0
  47. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/snowflake.py +1464 -0
  48. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/spark.py +202 -0
  49. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/spark2.py +349 -0
  50. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/sqlite.py +320 -0
  51. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/starrocks.py +343 -0
  52. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/tableau.py +61 -0
  53. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/teradata.py +356 -0
  54. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/trino.py +115 -0
  55. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/dialects/tsql.py +1403 -0
  56. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/diff.py +456 -0
  57. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/errors.py +93 -0
  58. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/executor/__init__.py +95 -0
  59. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/executor/context.py +101 -0
  60. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/executor/env.py +246 -0
  61. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/executor/python.py +460 -0
  62. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/executor/table.py +155 -0
  63. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/expressions.py +8870 -0
  64. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/generator.py +4993 -0
  65. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/helper.py +582 -0
  66. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/jsonpath.py +227 -0
  67. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/lineage.py +423 -0
  68. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/__init__.py +11 -0
  69. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/annotate_types.py +589 -0
  70. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/canonicalize.py +222 -0
  71. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/eliminate_ctes.py +43 -0
  72. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/eliminate_joins.py +181 -0
  73. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/eliminate_subqueries.py +189 -0
  74. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/isolate_table_selects.py +50 -0
  75. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/merge_subqueries.py +415 -0
  76. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/normalize.py +200 -0
  77. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/normalize_identifiers.py +64 -0
  78. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/optimize_joins.py +91 -0
  79. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/optimizer.py +94 -0
  80. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/pushdown_predicates.py +222 -0
  81. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/pushdown_projections.py +172 -0
  82. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/qualify.py +104 -0
  83. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/qualify_columns.py +1024 -0
  84. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/qualify_tables.py +155 -0
  85. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/scope.py +904 -0
  86. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/simplify.py +1587 -0
  87. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/optimizer/unnest_subqueries.py +302 -0
  88. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/parser.py +8501 -0
  89. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/planner.py +463 -0
  90. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/schema.py +588 -0
  91. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/serde.py +68 -0
  92. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/time.py +687 -0
  93. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/tokens.py +1520 -0
  94. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/transforms.py +1020 -0
  95. package/dbt-tools/dist/altimate_python_packages/altimate_packages/sqlglot/trie.py +81 -0
  96. package/dbt-tools/dist/altimate_python_packages/dbt_core_integration.py +825 -0
  97. package/dbt-tools/dist/altimate_python_packages/dbt_utils.py +157 -0
  98. package/dbt-tools/dist/index.js +23859 -0
  99. package/package.json +13 -13
  100. package/postinstall.mjs +42 -0
  101. package/skills/altimate-setup/SKILL.md +31 -0
@@ -0,0 +1,456 @@
1
+ """
2
+ .. include:: ../posts/sql_diff.md
3
+
4
+ ----
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import typing as t
10
+ from collections import defaultdict
11
+ from dataclasses import dataclass
12
+ from heapq import heappop, heappush
13
+ from itertools import chain
14
+
15
+ from sqlglot import Dialect, expressions as exp
16
+ from sqlglot.helper import seq_get
17
+
18
+ if t.TYPE_CHECKING:
19
+ from sqlglot.dialects.dialect import DialectType
20
+
21
+
22
@dataclass(frozen=True)
class Insert:
    """Indicates that a new node has been inserted"""

    # The node that is present in the target tree but absent from the source tree.
    expression: exp.Expression
27
+
28
+
29
@dataclass(frozen=True)
class Remove:
    """Indicates that an existing node has been removed"""

    # The node that is present in the source tree but absent from the target tree.
    expression: exp.Expression
34
+
35
+
36
@dataclass(frozen=True)
class Move:
    """Indicates that an existing node's position within the tree has changed"""

    source: exp.Expression  # the node as it appears in the source tree
    target: exp.Expression  # the matched node in the target tree
42
+
43
+
44
@dataclass(frozen=True)
class Update:
    """Indicates that an existing node has been updated"""

    source: exp.Expression  # the node as it appears in the source tree
    target: exp.Expression  # the matched node in the target tree
50
+
51
+
52
@dataclass(frozen=True)
class Keep:
    """Indicates that an existing node hasn't been changed"""

    source: exp.Expression  # the node as it appears in the source tree
    target: exp.Expression  # the matched node in the target tree
58
+
59
+
60
if t.TYPE_CHECKING:
    from sqlglot._typing import T

    # The union of every edit operation that can appear in an edit script.
    Edit = t.Union[Insert, Remove, Move, Update, Keep]
64
+
65
+
66
def diff(
    source: exp.Expression,
    target: exp.Expression,
    matchings: t.List[t.Tuple[exp.Expression, exp.Expression]] | None = None,
    delta_only: bool = False,
    **kwargs: t.Any,
) -> t.List[Edit]:
    """
    Returns the list of changes between the source and the target expressions.

    Examples:
        >>> diff(parse_one("a + b"), parse_one("a + c"))
        [
            Remove(expression=(COLUMN this: (IDENTIFIER this: b, quoted: False))),
            Insert(expression=(COLUMN this: (IDENTIFIER this: c, quoted: False))),
            Keep(
                source=(ADD this: ...),
                target=(ADD this: ...)
            ),
            Keep(
                source=(COLUMN this: (IDENTIFIER this: a, quoted: False)),
                target=(COLUMN this: (IDENTIFIER this: a, quoted: False))
            ),
        ]

    Args:
        source: the source expression.
        target: the target expression against which the diff should be calculated.
        matchings: the list of pre-matched node pairs which is used to help the algorithm's
            heuristics produce better results for subtrees that are known by a caller to be matching.
            Note: expression references in this list must refer to the same node objects that are
            referenced in the source / target trees.
        delta_only: excludes all `Keep` nodes from the diff.
        kwargs: additional arguments to pass to the ChangeDistiller instance.

    Returns:
        the list of Insert, Remove, Move, Update and Keep objects for each node in the source and the
        target expression trees. This list represents a sequence of steps needed to transform the source
        expression tree into the target one.
    """
    matchings = matchings or []

    def compute_node_mappings(
        old_nodes: tuple[exp.Expression, ...], new_nodes: tuple[exp.Expression, ...]
    ) -> t.Dict[int, exp.Expression]:
        # Maps each original node's id to the corresponding node of the copied tree,
        # caching the copy's hash along the way to speed up equality comparisons.
        node_mapping = {}
        for old_node, new_node in zip(reversed(old_nodes), reversed(new_nodes)):
            new_node._hash = hash(new_node)
            node_mapping[id(old_node)] = new_node

        return node_mapping

    # if the source and target have any shared objects, that means there's an issue with the ast
    # the algorithm won't work because the parent / hierarchies will be inaccurate
    source_nodes = tuple(source.walk())
    target_nodes = tuple(target.walk())
    source_ids = {id(n) for n in source_nodes}
    target_ids = {id(n) for n in target_nodes}

    copy = (
        len(source_nodes) != len(source_ids)
        or len(target_nodes) != len(target_ids)
        or source_ids & target_ids
    )

    source_copy = source.copy() if copy else source
    target_copy = target.copy() if copy else target

    try:
        # We cache the hash of each new node here to speed up equality comparisons. If the input
        # trees aren't copied, these hashes will be evicted before returning the edit script.
        if copy and matchings:
            source_mapping = compute_node_mappings(source_nodes, tuple(source_copy.walk()))
            target_mapping = compute_node_mappings(target_nodes, tuple(target_copy.walk()))
            # Re-anchor the caller-provided matchings onto the copied trees.
            matchings = [(source_mapping[id(s)], target_mapping[id(t)]) for s, t in matchings]
        else:
            for node in chain(reversed(source_nodes), reversed(target_nodes)):
                node._hash = hash(node)

        edit_script = ChangeDistiller(**kwargs).diff(
            source_copy,
            target_copy,
            matchings=matchings,
            delta_only=delta_only,
        )
    finally:
        if not copy:
            # Evict the cached hashes so the caller's nodes aren't left with stale values.
            for node in chain(source_nodes, target_nodes):
                node._hash = None

    return edit_script
157
+
158
+
159
# The expression types for which Update edits are allowed.
UPDATABLE_EXPRESSION_TYPES = (
    exp.Alias,
    exp.Boolean,
    exp.Column,
    exp.DataType,
    exp.Lambda,
    exp.Literal,
    exp.Table,
    exp.Window,
)

# Leaf node types that the matching algorithm skips entirely (they are covered via their parents).
IGNORED_LEAF_EXPRESSION_TYPES = (exp.Identifier,)
172
+
173
+
174
class ChangeDistiller:
    """
    The implementation of the Change Distiller algorithm described by Beat Fluri and Martin Pinzger in
    their paper https://ieeexplore.ieee.org/document/4339230, which in turn is based on the algorithm by
    Chawathe et al. described in http://ilpubs.stanford.edu:8090/115/1/1995-46.pdf.
    """

    def __init__(self, f: float = 0.6, t: float = 0.6, dialect: DialectType = None) -> None:
        # f: minimum dice-coefficient similarity required for two nodes to match.
        # t: minimum fraction of matching leaves required for two inner nodes to match.
        self.f = f
        self.t = t
        # Generator used to render expressions into SQL text for bigram comparison.
        self._sql_generator = Dialect.get_or_raise(dialect).generator()

    def diff(
        self,
        source: exp.Expression,
        target: exp.Expression,
        matchings: t.List[t.Tuple[exp.Expression, exp.Expression]] | None = None,
        delta_only: bool = False,
    ) -> t.List[Edit]:
        """Compute the edit script that transforms `source` into `target`."""
        matchings = matchings or []
        pre_matched_nodes = {id(s): id(t) for s, t in matchings}

        self._source = source
        self._target = target
        # Index every node by id, skipping ignored leaf types (e.g. identifiers).
        self._source_index = {
            id(n): n for n in self._source.bfs() if not isinstance(n, IGNORED_LEAF_EXPRESSION_TYPES)
        }
        self._target_index = {
            id(n): n for n in self._target.bfs() if not isinstance(n, IGNORED_LEAF_EXPRESSION_TYPES)
        }
        # Nodes not covered by caller-provided matchings start out unmatched.
        self._unmatched_source_nodes = set(self._source_index) - set(pre_matched_nodes)
        self._unmatched_target_nodes = set(self._target_index) - set(pre_matched_nodes.values())
        self._bigram_histo_cache: t.Dict[int, t.DefaultDict[str, int]] = {}

        matching_set = self._compute_matching_set() | set(pre_matched_nodes.items())
        return self._generate_edit_script(dict(matching_set), delta_only)

    def _generate_edit_script(self, matchings: t.Dict[int, int], delta_only: bool) -> t.List[Edit]:
        """Turn the computed node matchings into Insert/Remove/Move/Update/Keep edits."""
        edit_script: t.List[Edit] = []
        # Unmatched nodes map directly to removals (source side) and insertions (target side).
        for removed_node_id in self._unmatched_source_nodes:
            edit_script.append(Remove(self._source_index[removed_node_id]))
        for inserted_node_id in self._unmatched_target_nodes:
            edit_script.append(Insert(self._target_index[inserted_node_id]))
        for kept_source_node_id, kept_target_node_id in matchings.items():
            source_node = self._source_index[kept_source_node_id]
            target_node = self._target_index[kept_target_node_id]

            identical_nodes = source_node == target_node

            if not isinstance(source_node, UPDATABLE_EXPRESSION_TYPES) or identical_nodes:
                if identical_nodes:
                    source_parent = source_node.parent
                    target_parent = target_node.parent

                    # An identical node whose parents don't match has moved within the tree.
                    if (
                        (source_parent and not target_parent)
                        or (not source_parent and target_parent)
                        or (
                            source_parent
                            and target_parent
                            and matchings.get(id(source_parent)) != id(target_parent)
                        )
                    ):
                        edit_script.append(Move(source=source_node, target=target_node))
                else:
                    edit_script.extend(
                        self._generate_move_edits(source_node, target_node, matchings)
                    )

                # Compare the non-expression args (literal values, flags) of both nodes.
                source_non_expression_leaves = dict(_get_non_expression_leaves(source_node))
                target_non_expression_leaves = dict(_get_non_expression_leaves(target_node))

                if source_non_expression_leaves != target_non_expression_leaves:
                    edit_script.append(Update(source_node, target_node))
                elif not delta_only:
                    edit_script.append(Keep(source_node, target_node))
            else:
                edit_script.append(Update(source_node, target_node))

        return edit_script

    def _generate_move_edits(
        self, source: exp.Expression, target: exp.Expression, matchings: t.Dict[int, int]
    ) -> t.List[Move]:
        """Emit Move edits for matched children that fall outside the LCS of the two child lists."""
        source_args = [id(e) for e in _expression_only_args(source)]
        target_args = [id(e) for e in _expression_only_args(target)]

        args_lcs = set(
            _lcs(source_args, target_args, lambda l, r: matchings.get(t.cast(int, l)) == r)
        )

        move_edits = []
        for a in source_args:
            # Children that are matched but not part of the common subsequence have moved.
            if a not in args_lcs and a not in self._unmatched_source_nodes:
                move_edits.append(
                    Move(source=self._source_index[a], target=self._target_index[matchings[a]])
                )

        return move_edits

    def _compute_matching_set(self) -> t.Set[t.Tuple[int, int]]:
        """Match inner nodes bottom-up, seeded by the leaf matchings."""
        leaves_matching_set = self._compute_leaf_matching_set()
        matching_set = leaves_matching_set.copy()

        # Dicts preserve insertion order, so these act as ordered sets in BFS order.
        ordered_unmatched_source_nodes = {
            id(n): None for n in self._source.bfs() if id(n) in self._unmatched_source_nodes
        }
        ordered_unmatched_target_nodes = {
            id(n): None for n in self._target.bfs() if id(n) in self._unmatched_target_nodes
        }

        for source_node_id in ordered_unmatched_source_nodes:
            for target_node_id in ordered_unmatched_target_nodes:
                source_node = self._source_index[source_node_id]
                target_node = self._target_index[target_node_id]
                if _is_same_type(source_node, target_node):
                    source_leaf_ids = {id(l) for l in _get_expression_leaves(source_node)}
                    target_leaf_ids = {id(l) for l in _get_expression_leaves(target_node)}

                    max_leaves_num = max(len(source_leaf_ids), len(target_leaf_ids))
                    if max_leaves_num:
                        # Fraction of matched leaf pairs shared by the two subtrees.
                        common_leaves_num = sum(
                            1 if s in source_leaf_ids and t in target_leaf_ids else 0
                            for s, t in leaves_matching_set
                        )
                        leaf_similarity_score = common_leaves_num / max_leaves_num
                    else:
                        leaf_similarity_score = 0.0

                    # Small subtrees (<= 4 leaves) use a relaxed threshold of 0.4.
                    adjusted_t = (
                        self.t if min(len(source_leaf_ids), len(target_leaf_ids)) > 4 else 0.4
                    )

                    if leaf_similarity_score >= 0.8 or (
                        leaf_similarity_score >= adjusted_t
                        and self._dice_coefficient(source_node, target_node) >= self.f
                    ):
                        matching_set.add((source_node_id, target_node_id))
                        self._unmatched_source_nodes.remove(source_node_id)
                        self._unmatched_target_nodes.remove(target_node_id)
                        ordered_unmatched_target_nodes.pop(target_node_id, None)
                        break

        return matching_set

    def _compute_leaf_matching_set(self) -> t.Set[t.Tuple[int, int]]:
        """Greedily pair up source/target leaves by descending text similarity."""
        # Heap entries: (-similarity, -parent_similarity, tiebreaker, source, target) so that
        # heappop yields the best-scoring candidate first.
        candidate_matchings: t.List[t.Tuple[float, int, int, exp.Expression, exp.Expression]] = []
        source_expression_leaves = list(_get_expression_leaves(self._source))
        target_expression_leaves = list(_get_expression_leaves(self._target))
        for source_leaf in source_expression_leaves:
            for target_leaf in target_expression_leaves:
                if _is_same_type(source_leaf, target_leaf):
                    similarity_score = self._dice_coefficient(source_leaf, target_leaf)
                    if similarity_score >= self.f:
                        heappush(
                            candidate_matchings,
                            (
                                -similarity_score,
                                -_parent_similarity_score(source_leaf, target_leaf),
                                len(candidate_matchings),
                                source_leaf,
                                target_leaf,
                            ),
                        )

        # Pick best matchings based on the highest score
        matching_set = set()
        while candidate_matchings:
            _, _, _, source_leaf, target_leaf = heappop(candidate_matchings)
            if (
                id(source_leaf) in self._unmatched_source_nodes
                and id(target_leaf) in self._unmatched_target_nodes
            ):
                matching_set.add((id(source_leaf), id(target_leaf)))
                self._unmatched_source_nodes.remove(id(source_leaf))
                self._unmatched_target_nodes.remove(id(target_leaf))

        return matching_set

    def _dice_coefficient(self, source: exp.Expression, target: exp.Expression) -> float:
        """Return the Sørensen–Dice similarity of the two nodes' SQL-text bigrams."""
        source_histo = self._bigram_histo(source)
        target_histo = self._bigram_histo(target)

        total_grams = sum(source_histo.values()) + sum(target_histo.values())
        if not total_grams:
            # Both render to text shorter than 2 characters; fall back to node equality.
            return 1.0 if source == target else 0.0

        overlap_len = 0
        overlapping_grams = set(source_histo) & set(target_histo)
        for g in overlapping_grams:
            overlap_len += min(source_histo[g], target_histo[g])

        return 2 * overlap_len / total_grams

    def _bigram_histo(self, expression: exp.Expression) -> t.DefaultDict[str, int]:
        """Return (and cache per-node) a histogram of character bigrams of the node's SQL text."""
        if id(expression) in self._bigram_histo_cache:
            return self._bigram_histo_cache[id(expression)]

        expression_str = self._sql_generator.generate(expression)
        count = max(0, len(expression_str) - 1)
        bigram_histo: t.DefaultDict[str, int] = defaultdict(int)
        for i in range(count):
            bigram_histo[expression_str[i : i + 2]] += 1

        self._bigram_histo_cache[id(expression)] = bigram_histo
        return bigram_histo
380
+
381
+
382
def _get_expression_leaves(expression: exp.Expression) -> t.Iterator[exp.Expression]:
    """Recursively yield the leaf expressions of a subtree, skipping ignored node types."""
    child_found = False

    for child in expression.iter_expressions():
        if isinstance(child, IGNORED_LEAF_EXPRESSION_TYPES):
            continue
        child_found = True
        yield from _get_expression_leaves(child)

    # A node without (non-ignored) child expressions is itself a leaf.
    if not child_found:
        yield expression
392
+
393
+
394
def _get_non_expression_leaves(expression: exp.Expression) -> t.Iterator[t.Tuple[str, t.Any]]:
    """Yield (arg_name, value) pairs for args that do not hold child expressions."""
    for name, value in expression.args.items():
        holds_expression = isinstance(value, exp.Expression) or (
            isinstance(value, list) and isinstance(seq_get(value, 0), exp.Expression)
        )
        if not holds_expression:
            yield (name, value)
402
+
403
+
404
def _is_same_type(source: exp.Expression, target: exp.Expression) -> bool:
    """Return True if two nodes are comparable for matching purposes."""
    if type(source) is not type(target):
        return False

    # Joins are only comparable when they have the same side (e.g. LEFT vs RIGHT).
    if isinstance(source, exp.Join):
        return source.args.get("side") == target.args.get("side")

    # Anonymous functions are only comparable when they share the same name.
    if isinstance(source, exp.Anonymous):
        return source.this == target.this

    return True
415
+
416
+
417
+ def _parent_similarity_score(
418
+ source: t.Optional[exp.Expression], target: t.Optional[exp.Expression]
419
+ ) -> int:
420
+ if source is None or target is None or type(source) is not type(target):
421
+ return 0
422
+
423
+ return 1 + _parent_similarity_score(source.parent, target.parent)
424
+
425
+
426
def _expression_only_args(expression: exp.Expression) -> t.Iterator[exp.Expression]:
    """Yield the child expressions of a node, filtering out ignored leaf types."""
    for child in expression.iter_expressions():
        if not isinstance(child, IGNORED_LEAF_EXPRESSION_TYPES):
            yield child
432
+
433
+
434
+ def _lcs(
435
+ seq_a: t.Sequence[T], seq_b: t.Sequence[T], equal: t.Callable[[T, T], bool]
436
+ ) -> t.Sequence[t.Optional[T]]:
437
+ """Calculates the longest common subsequence"""
438
+
439
+ len_a = len(seq_a)
440
+ len_b = len(seq_b)
441
+ lcs_result = [[None] * (len_b + 1) for i in range(len_a + 1)]
442
+
443
+ for i in range(len_a + 1):
444
+ for j in range(len_b + 1):
445
+ if i == 0 or j == 0:
446
+ lcs_result[i][j] = [] # type: ignore
447
+ elif equal(seq_a[i - 1], seq_b[j - 1]):
448
+ lcs_result[i][j] = lcs_result[i - 1][j - 1] + [seq_a[i - 1]] # type: ignore
449
+ else:
450
+ lcs_result[i][j] = (
451
+ lcs_result[i - 1][j]
452
+ if len(lcs_result[i - 1][j]) > len(lcs_result[i][j - 1]) # type: ignore
453
+ else lcs_result[i][j - 1]
454
+ )
455
+
456
+ return lcs_result[len_a][len_b] # type: ignore
@@ -0,0 +1,93 @@
1
+ from __future__ import annotations
2
+
3
+ import typing as t
4
+ from enum import auto
5
+
6
+ from sqlglot.helper import AutoName
7
+
8
+
9
class ErrorLevel(AutoName):
    """Enumerates the supported error-handling strategies."""

    IGNORE = auto()
    """Ignore all errors."""

    WARN = auto()
    """Log all errors."""

    RAISE = auto()
    """Collect all errors and raise a single exception."""

    IMMEDIATE = auto()
    """Immediately raise an exception on the first error found."""
21
+
22
+
23
class SqlglotError(Exception):
    """Base class for all sqlglot exceptions."""

    pass
25
+
26
+
27
class UnsupportedError(SqlglotError):
    """Raised when an unsupported construct is encountered."""

    pass
29
+
30
+
31
class ParseError(SqlglotError):
    """Error raised during parsing; carries a list of structured error entries."""

    def __init__(
        self,
        message: str,
        errors: t.Optional[t.List[t.Dict[str, t.Any]]] = None,
    ):
        super().__init__(message)
        # Each entry is a dict describing one error occurrence (see `new`).
        self.errors = errors or []

    @classmethod
    def new(
        cls,
        message: str,
        description: t.Optional[str] = None,
        line: t.Optional[int] = None,
        col: t.Optional[int] = None,
        start_context: t.Optional[str] = None,
        highlight: t.Optional[str] = None,
        end_context: t.Optional[str] = None,
        into_expression: t.Optional[str] = None,
    ) -> ParseError:
        """Build a ParseError that wraps a single structured error entry."""
        entry = {
            "description": description,
            "line": line,
            "col": col,
            "start_context": start_context,
            "highlight": highlight,
            "end_context": end_context,
            "into_expression": into_expression,
        }
        return cls(message, [entry])
66
+
67
+
68
class TokenError(SqlglotError):
    """Raised by the tokenizer."""

    pass
70
+
71
+
72
class OptimizeError(SqlglotError):
    """Raised by the optimizer."""

    pass
74
+
75
+
76
class SchemaError(SqlglotError):
    """Raised for schema-related problems."""

    pass
78
+
79
+
80
class ExecuteError(SqlglotError):
    """Raised by the executor when a query cannot be run."""

    pass
82
+
83
+
84
def concat_messages(errors: t.Sequence[t.Any], maximum: int) -> str:
    """Join up to `maximum` error messages with blank lines, noting how many were omitted."""
    shown = [str(error) for error in errors[:maximum]]
    omitted = len(errors) - maximum
    if omitted > 0:
        shown.append(f"... and {omitted} more")
    return "\n\n".join(shown)
90
+
91
+
92
def merge_errors(errors: t.Sequence[ParseError]) -> t.List[t.Dict[str, t.Any]]:
    """Flatten the structured error entries of several ParseErrors into a single list."""
    merged: t.List[t.Dict[str, t.Any]] = []
    for error in errors:
        merged.extend(error.errors)
    return merged
@@ -0,0 +1,95 @@
1
+ """
2
+ .. include:: ../../posts/python_sql_engine.md
3
+
4
+ ----
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import logging
10
+ import time
11
+ import typing as t
12
+
13
+ from sqlglot import exp
14
+ from sqlglot.errors import ExecuteError
15
+ from sqlglot.executor.python import PythonExecutor
16
+ from sqlglot.executor.table import Table, ensure_tables
17
+ from sqlglot.helper import dict_depth
18
+ from sqlglot.optimizer import optimize
19
+ from sqlglot.optimizer.annotate_types import annotate_types
20
+ from sqlglot.planner import Plan
21
+ from sqlglot.schema import ensure_schema, flatten_schema, nested_get, nested_set
22
+
23
+ logger = logging.getLogger("sqlglot")
24
+
25
+ if t.TYPE_CHECKING:
26
+ from sqlglot.dialects.dialect import DialectType
27
+ from sqlglot.expressions import Expression
28
+ from sqlglot.schema import Schema
29
+
30
+
31
def execute(
    sql: str | Expression,
    schema: t.Optional[t.Dict | Schema] = None,
    read: DialectType = None,
    dialect: DialectType = None,
    tables: t.Optional[t.Dict] = None,
) -> Table:
    """
    Run a sql query against data.

    Args:
        sql: a sql statement.
        schema: database schema.
            This can either be an instance of `Schema` or a mapping in one of the following forms:
            1. {table: {col: type}}
            2. {db: {table: {col: type}}}
            3. {catalog: {db: {table: {col: type}}}}
        read: the SQL dialect to apply during parsing (eg. "spark", "hive", "presto", "mysql").
        dialect: the SQL dialect (alias for read).
        tables: additional tables to register.

    Returns:
        Simple columnar data structure.

    Raises:
        ExecuteError: if the registered tables don't support the same table args as the schema.
    """
    read = read or dialect
    tables_ = ensure_tables(tables, dialect=read)

    if not schema:
        # No schema provided: infer one from the registered tables' first rows.
        schema = {}
        flattened_tables = flatten_schema(tables_.mapping, depth=dict_depth(tables_.mapping))

        for keys in flattened_tables:
            table = nested_get(tables_.mapping, *zip(keys, keys))
            assert table is not None

            for column in table.columns:
                # Infer each column's type from its first value; fall back to the
                # Python type name when annotate_types can't determine one.
                value = table[0][column]
                column_type = (
                    annotate_types(exp.convert(value), dialect=read).type or type(value).__name__
                )
                nested_set(schema, [*keys, column], column_type)

    schema = ensure_schema(schema, dialect=read)

    if tables_.supported_table_args and tables_.supported_table_args != schema.supported_table_args:
        raise ExecuteError("Tables must support the same table args as schema")

    now = time.time()
    expression = optimize(
        sql, schema, leave_tables_isolated=True, infer_csv_schemas=True, dialect=read
    )

    logger.debug("Optimization finished: %f", time.time() - now)
    logger.debug("Optimized SQL: %s", expression.sql(pretty=True))

    # Build a logical plan from the optimized expression and execute it in pure Python.
    plan = Plan(expression)

    logger.debug("Logical Plan: %s", plan)

    now = time.time()
    result = PythonExecutor(tables=tables_).execute(plan)

    logger.debug("Query finished: %f", time.time() - now)

    return result