recce-nightly 1.2.0.20250506__py3-none-any.whl → 1.4.0.20250514__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of recce-nightly might be problematic. Click here for more details.
- recce/VERSION +1 -1
- recce/__init__.py +22 -22
- recce/adapter/base.py +11 -14
- recce/adapter/dbt_adapter/__init__.py +355 -316
- recce/adapter/dbt_adapter/dbt_version.py +3 -0
- recce/adapter/sqlmesh_adapter.py +24 -35
- recce/apis/check_api.py +39 -28
- recce/apis/check_func.py +33 -27
- recce/apis/run_api.py +25 -19
- recce/apis/run_func.py +29 -23
- recce/artifact.py +44 -49
- recce/cli.py +484 -285
- recce/config.py +42 -33
- recce/core.py +52 -44
- recce/data/404.html +1 -1
- recce/data/_next/static/chunks/{368-7587b306577df275.js → 778-aef312bffb4c0312.js} +15 -15
- recce/data/_next/static/chunks/8d700b6a.ed11a130057c7a47.js +1 -0
- recce/data/_next/static/chunks/app/layout-c713a2829d3279e4.js +1 -0
- recce/data/_next/static/chunks/app/page-7086764277331fcb.js +1 -0
- recce/data/_next/static/chunks/{cd9f8d63-cf0d5a7b0f7a92e8.js → cd9f8d63-e020f408095ed77c.js} +3 -3
- recce/data/_next/static/chunks/webpack-b787cb1a4f2293de.js +1 -0
- recce/data/_next/static/css/88b8abc134cfd59a.css +3 -0
- recce/data/index.html +2 -2
- recce/data/index.txt +2 -2
- recce/diff.py +6 -12
- recce/event/__init__.py +74 -72
- recce/event/collector.py +27 -20
- recce/event/track.py +39 -27
- recce/exceptions.py +1 -1
- recce/git.py +7 -7
- recce/github.py +57 -53
- recce/models/__init__.py +1 -1
- recce/models/check.py +6 -7
- recce/models/run.py +1 -0
- recce/models/types.py +27 -27
- recce/pull_request.py +26 -24
- recce/run.py +148 -111
- recce/server.py +105 -88
- recce/state.py +209 -177
- recce/summary.py +168 -143
- recce/tasks/__init__.py +3 -3
- recce/tasks/core.py +11 -13
- recce/tasks/dataframe.py +19 -17
- recce/tasks/histogram.py +69 -34
- recce/tasks/lineage.py +2 -2
- recce/tasks/profile.py +152 -86
- recce/tasks/query.py +139 -87
- recce/tasks/rowcount.py +33 -30
- recce/tasks/schema.py +14 -14
- recce/tasks/top_k.py +35 -35
- recce/tasks/valuediff.py +216 -152
- recce/util/breaking.py +77 -84
- recce/util/cll.py +55 -51
- recce/util/io.py +19 -17
- recce/util/logger.py +1 -1
- recce/util/recce_cloud.py +70 -72
- recce/util/singleton.py +4 -4
- recce/yaml/__init__.py +7 -10
- {recce_nightly-1.2.0.20250506.dist-info → recce_nightly-1.4.0.20250514.dist-info}/METADATA +5 -2
- recce_nightly-1.4.0.20250514.dist-info/RECORD +143 -0
- {recce_nightly-1.2.0.20250506.dist-info → recce_nightly-1.4.0.20250514.dist-info}/WHEEL +1 -1
- tests/adapter/dbt_adapter/conftest.py +1 -0
- tests/adapter/dbt_adapter/dbt_test_helper.py +28 -18
- tests/adapter/dbt_adapter/test_dbt_adapter.py +0 -15
- tests/adapter/dbt_adapter/test_dbt_cll.py +39 -32
- tests/adapter/dbt_adapter/test_selector.py +22 -21
- tests/tasks/test_histogram.py +58 -66
- tests/tasks/test_lineage.py +36 -23
- tests/tasks/test_preset_checks.py +45 -31
- tests/tasks/test_profile.py +340 -15
- tests/tasks/test_query.py +40 -40
- tests/tasks/test_row_count.py +65 -46
- tests/tasks/test_schema.py +65 -42
- tests/tasks/test_top_k.py +22 -18
- tests/tasks/test_valuediff.py +43 -32
- tests/test_cli.py +71 -58
- tests/test_config.py +7 -9
- tests/test_core.py +5 -3
- tests/test_dbt.py +7 -7
- tests/test_pull_request.py +1 -1
- tests/test_server.py +19 -13
- tests/test_state.py +40 -27
- tests/test_summary.py +18 -14
- recce/data/_next/static/chunks/8d700b6a-f0b1f6b9e0d97ce2.js +0 -1
- recce/data/_next/static/chunks/app/layout-9102e22cb73f74d6.js +0 -1
- recce/data/_next/static/chunks/app/page-cee661090afbd6aa.js +0 -1
- recce/data/_next/static/chunks/webpack-567d72f0bc0820d5.js +0 -1
- recce_nightly-1.2.0.20250506.dist-info/RECORD +0 -142
- /recce/data/_next/static/{Kcbs3GEIyH2LxgLYat0es → E_HPXsXdrqHg2YEHmU3mK}/_buildManifest.js +0 -0
- /recce/data/_next/static/{Kcbs3GEIyH2LxgLYat0es → E_HPXsXdrqHg2YEHmU3mK}/_ssgManifest.js +0 -0
- {recce_nightly-1.2.0.20250506.dist-info → recce_nightly-1.4.0.20250514.dist-info}/entry_points.txt +0 -0
- {recce_nightly-1.2.0.20250506.dist-info → recce_nightly-1.4.0.20250514.dist-info}/licenses/LICENSE +0 -0
- {recce_nightly-1.2.0.20250506.dist-info → recce_nightly-1.4.0.20250514.dist-info}/top_level.txt +0 -0
recce/util/breaking.py
CHANGED
|
@@ -3,15 +3,15 @@ from dataclasses import dataclass
|
|
|
3
3
|
from typing import Optional
|
|
4
4
|
|
|
5
5
|
import sqlglot.expressions as exp
|
|
6
|
-
from sqlglot import
|
|
6
|
+
from sqlglot import Dialect, parse_one
|
|
7
7
|
from sqlglot.errors import SqlglotError
|
|
8
|
-
from sqlglot.optimizer import
|
|
8
|
+
from sqlglot.optimizer import Scope, traverse_scope
|
|
9
9
|
from sqlglot.optimizer.qualify import qualify
|
|
10
10
|
|
|
11
|
-
from recce.models.types import
|
|
11
|
+
from recce.models.types import ChangeStatus, NodeChange
|
|
12
12
|
|
|
13
|
-
CHANGE_CATEGORY_UNKNOWN = NodeChange(category=
|
|
14
|
-
CHANGE_CATEGORY_BREAKING = NodeChange(category=
|
|
13
|
+
CHANGE_CATEGORY_UNKNOWN = NodeChange(category="unknown")
|
|
14
|
+
CHANGE_CATEGORY_BREAKING = NodeChange(category="breaking")
|
|
15
15
|
|
|
16
16
|
|
|
17
17
|
@dataclass
|
|
@@ -48,11 +48,11 @@ class BreakingPerformanceTracking:
|
|
|
48
48
|
|
|
49
49
|
def to_dict(self):
|
|
50
50
|
return {
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
51
|
+
"lineage_diff_elapsed_ms": self.lineage_diff_elapsed,
|
|
52
|
+
"modified_nodes": self.modified_nodes,
|
|
53
|
+
"sqlglot_error_nodes": self.sqlglot_error_nodes,
|
|
54
|
+
"other_error_nodes": self.other_error_nodes,
|
|
55
|
+
"checkpoints": self.checkpoints,
|
|
56
56
|
}
|
|
57
57
|
|
|
58
58
|
def reset(self):
|
|
@@ -64,33 +64,29 @@ class BreakingPerformanceTracking:
|
|
|
64
64
|
self.checkpoints = {}
|
|
65
65
|
|
|
66
66
|
|
|
67
|
-
def _diff_select_scope(
|
|
68
|
-
old_scope
|
|
69
|
-
new_scope
|
|
70
|
-
scope_changes_map: dict[Scope, NodeChange]
|
|
71
|
-
) -> NodeChange:
|
|
72
|
-
assert old_scope.expression.key == 'select'
|
|
73
|
-
assert new_scope.expression.key == 'select'
|
|
67
|
+
def _diff_select_scope(old_scope: Scope, new_scope: Scope, scope_changes_map: dict[Scope, NodeChange]) -> NodeChange:
|
|
68
|
+
assert old_scope.expression.key == "select"
|
|
69
|
+
assert new_scope.expression.key == "select"
|
|
74
70
|
|
|
75
|
-
change_category =
|
|
71
|
+
change_category = "non_breaking"
|
|
76
72
|
changed_columns = {}
|
|
77
73
|
|
|
78
74
|
# check if the upstream scopes is not breaking
|
|
79
75
|
for source_name, source in new_scope.sources.items():
|
|
80
76
|
if scope_changes_map.get(source) is not None:
|
|
81
77
|
chanage = scope_changes_map[source]
|
|
82
|
-
if chanage.category ==
|
|
83
|
-
change_category =
|
|
78
|
+
if chanage.category == "breaking":
|
|
79
|
+
change_category = "breaking"
|
|
84
80
|
|
|
85
81
|
# check if non-select expressions are the same
|
|
86
82
|
old_select = old_scope.expression # type: exp.Select
|
|
87
83
|
new_select = new_scope.expression # type: exp.Select
|
|
88
84
|
for arg_key in old_select.args.keys() | new_select.args.keys():
|
|
89
|
-
if arg_key in [
|
|
85
|
+
if arg_key in ["expressions", "with", "from"]:
|
|
90
86
|
continue
|
|
91
87
|
|
|
92
88
|
if old_select.args.get(arg_key) != new_select.args.get(arg_key):
|
|
93
|
-
change_category =
|
|
89
|
+
change_category = "breaking"
|
|
94
90
|
|
|
95
91
|
def source_column_change_status(ref_column: exp.Column) -> Optional[ChangeStatus]:
|
|
96
92
|
table_name = ref_column.table
|
|
@@ -108,9 +104,10 @@ def _diff_select_scope(
|
|
|
108
104
|
# selects
|
|
109
105
|
old_column_map = {projection.alias_or_name: projection for projection in old_select.selects}
|
|
110
106
|
new_column_map = {projection.alias_or_name: projection for projection in new_select.selects}
|
|
111
|
-
is_distinct = new_select.args.get(
|
|
107
|
+
is_distinct = new_select.args.get("distinct") is not None
|
|
108
|
+
|
|
109
|
+
for column_name in old_column_map.keys() | new_column_map.keys():
|
|
112
110
|
|
|
113
|
-
for column_name in (old_column_map.keys() | new_column_map.keys()):
|
|
114
111
|
def _has_udtf(expr: exp.Expression) -> bool:
|
|
115
112
|
return expr.find(exp.UDTF) is not None
|
|
116
113
|
|
|
@@ -124,116 +121,112 @@ def _diff_select_scope(
|
|
|
124
121
|
new_column = new_column_map.get(column_name)
|
|
125
122
|
if old_column is None:
|
|
126
123
|
if is_distinct:
|
|
127
|
-
change_category =
|
|
124
|
+
change_category = "breaking"
|
|
128
125
|
elif _has_udtf(new_column):
|
|
129
|
-
change_category =
|
|
126
|
+
change_category = "breaking"
|
|
130
127
|
|
|
131
|
-
changed_columns[column_name] =
|
|
128
|
+
changed_columns[column_name] = "added"
|
|
132
129
|
elif new_column is None:
|
|
133
130
|
if is_distinct:
|
|
134
|
-
change_category =
|
|
131
|
+
change_category = "breaking"
|
|
135
132
|
elif _has_udtf(old_column):
|
|
136
|
-
change_category =
|
|
133
|
+
change_category = "breaking"
|
|
137
134
|
|
|
138
|
-
changed_columns[column_name] =
|
|
139
|
-
if change_category !=
|
|
140
|
-
change_category =
|
|
135
|
+
changed_columns[column_name] = "removed"
|
|
136
|
+
if change_category != "breaking":
|
|
137
|
+
change_category = "partial_breaking"
|
|
141
138
|
elif old_column != new_column:
|
|
142
139
|
if is_distinct:
|
|
143
|
-
change_category =
|
|
140
|
+
change_category = "breaking"
|
|
144
141
|
elif _has_udtf(old_column) and _has_udtf(new_column):
|
|
145
|
-
change_category =
|
|
142
|
+
change_category = "breaking"
|
|
146
143
|
elif _has_aggregate(old_column) != _has_aggregate(new_column):
|
|
147
|
-
change_category =
|
|
144
|
+
change_category = "breaking"
|
|
148
145
|
|
|
149
|
-
changed_columns[column_name] =
|
|
150
|
-
if change_category !=
|
|
151
|
-
change_category =
|
|
146
|
+
changed_columns[column_name] = "modified"
|
|
147
|
+
if change_category != "breaking":
|
|
148
|
+
change_category = "partial_breaking"
|
|
152
149
|
else:
|
|
153
150
|
if _has_star(new_column):
|
|
154
151
|
for source_name, (_, source) in new_scope.selected_sources.items():
|
|
155
152
|
change = scope_changes_map.get(source)
|
|
156
153
|
if change is not None:
|
|
157
|
-
if change.category ==
|
|
158
|
-
change_category =
|
|
154
|
+
if change.category == "breaking":
|
|
155
|
+
change_category = "breaking"
|
|
159
156
|
for sub_column_name in change.columns.keys():
|
|
160
157
|
column_change_status = change.columns[sub_column_name]
|
|
161
158
|
changed_columns[sub_column_name] = column_change_status
|
|
162
|
-
if change_category !=
|
|
163
|
-
change_category =
|
|
159
|
+
if change_category != "breaking" and column_change_status in ["removed", "modified"]:
|
|
160
|
+
change_category = "partial_breaking"
|
|
164
161
|
continue
|
|
165
162
|
|
|
166
163
|
ref_columns = new_column.find_all(exp.Column)
|
|
167
164
|
for ref_column in ref_columns:
|
|
168
165
|
if source_column_change_status(ref_column) is not None:
|
|
169
166
|
if is_distinct:
|
|
170
|
-
change_category =
|
|
167
|
+
change_category = "breaking"
|
|
171
168
|
elif _has_udtf(new_column):
|
|
172
|
-
change_category =
|
|
169
|
+
change_category = "breaking"
|
|
173
170
|
|
|
174
|
-
if change_category !=
|
|
175
|
-
change_category =
|
|
176
|
-
changed_columns[column_name] =
|
|
171
|
+
if change_category != "breaking":
|
|
172
|
+
change_category = "partial_breaking"
|
|
173
|
+
changed_columns[column_name] = "modified"
|
|
177
174
|
|
|
178
175
|
def selected_column_change_status(ref_column: exp.Column) -> Optional[ChangeStatus]:
|
|
179
176
|
column_name = ref_column.name
|
|
180
177
|
return changed_columns.get(column_name)
|
|
181
178
|
|
|
182
179
|
# joins clause: Reference the source columns
|
|
183
|
-
if new_select.args.get(
|
|
184
|
-
joins = new_select.args.get(
|
|
180
|
+
if new_select.args.get("joins"):
|
|
181
|
+
joins = new_select.args.get("joins")
|
|
185
182
|
for join in joins:
|
|
186
183
|
if isinstance(join, exp.Join):
|
|
187
184
|
for ref_column in join.find_all(exp.Column):
|
|
188
185
|
if source_column_change_status(ref_column) is not None:
|
|
189
|
-
change_category =
|
|
186
|
+
change_category = "breaking"
|
|
190
187
|
|
|
191
188
|
# where clauses: Reference the source columns
|
|
192
|
-
if new_select.args.get(
|
|
193
|
-
where = new_select.args.get(
|
|
189
|
+
if new_select.args.get("where"):
|
|
190
|
+
where = new_select.args.get("where")
|
|
194
191
|
if isinstance(where, exp.Where):
|
|
195
192
|
for ref_column in where.find_all(exp.Column):
|
|
196
193
|
if source_column_change_status(ref_column) is not None:
|
|
197
|
-
change_category =
|
|
194
|
+
change_category = "breaking"
|
|
198
195
|
|
|
199
196
|
# group by clause: Reference the source columns, column index
|
|
200
|
-
if new_select.args.get(
|
|
201
|
-
group = new_select.args.get(
|
|
197
|
+
if new_select.args.get("group"):
|
|
198
|
+
group = new_select.args.get("group")
|
|
202
199
|
if isinstance(group, exp.Group):
|
|
203
200
|
for ref_column in group.find_all(exp.Column):
|
|
204
201
|
if source_column_change_status(ref_column) is not None:
|
|
205
|
-
change_category =
|
|
202
|
+
change_category = "breaking"
|
|
206
203
|
|
|
207
204
|
# having clause: Reference the source columns, selected columns
|
|
208
|
-
if new_select.args.get(
|
|
209
|
-
having = new_select.args.get(
|
|
205
|
+
if new_select.args.get("having"):
|
|
206
|
+
having = new_select.args.get("having")
|
|
210
207
|
if isinstance(having, exp.Having):
|
|
211
208
|
for ref_column in having.find_all(exp.Column):
|
|
212
209
|
if source_column_change_status(ref_column) is not None:
|
|
213
|
-
change_category =
|
|
210
|
+
change_category = "breaking"
|
|
214
211
|
elif selected_column_change_status(ref_column) is not None:
|
|
215
|
-
change_category =
|
|
212
|
+
change_category = "breaking"
|
|
216
213
|
|
|
217
214
|
# order by clause: Reference the source columns, selected columns, column index
|
|
218
|
-
if new_select.args.get(
|
|
219
|
-
order = new_select.args.get(
|
|
215
|
+
if new_select.args.get("order"):
|
|
216
|
+
order = new_select.args.get("order")
|
|
220
217
|
if isinstance(order, exp.Order):
|
|
221
218
|
for ref_column in order.find_all(exp.Column):
|
|
222
219
|
if source_column_change_status(ref_column) is not None:
|
|
223
|
-
change_category =
|
|
220
|
+
change_category = "breaking"
|
|
224
221
|
elif selected_column_change_status(ref_column) is not None:
|
|
225
|
-
change_category =
|
|
222
|
+
change_category = "breaking"
|
|
226
223
|
|
|
227
224
|
return NodeChange(category=change_category, columns=changed_columns)
|
|
228
225
|
|
|
229
226
|
|
|
230
|
-
def _diff_union_scope(
|
|
231
|
-
old_scope
|
|
232
|
-
new_scope
|
|
233
|
-
scope_changes_map: dict[Scope, NodeChange]
|
|
234
|
-
) -> NodeChange:
|
|
235
|
-
assert old_scope.expression.key == 'union'
|
|
236
|
-
assert new_scope.expression.key == 'union'
|
|
227
|
+
def _diff_union_scope(old_scope: Scope, new_scope: Scope, scope_changes_map: dict[Scope, NodeChange]) -> NodeChange:
|
|
228
|
+
assert old_scope.expression.key == "union"
|
|
229
|
+
assert new_scope.expression.key == "union"
|
|
237
230
|
assert len(old_scope.union_scopes) == len(new_scope.union_scopes)
|
|
238
231
|
assert new_scope.union_scopes is not None
|
|
239
232
|
assert len(new_scope.union_scopes) > 0
|
|
@@ -244,11 +237,11 @@ def _diff_union_scope(
|
|
|
244
237
|
|
|
245
238
|
for sub_scope in new_scope.union_scopes[1:]:
|
|
246
239
|
result_right = scope_changes_map.get(sub_scope)
|
|
247
|
-
if change_category ==
|
|
248
|
-
if result_right.category in [
|
|
240
|
+
if change_category == "partial_breaking":
|
|
241
|
+
if result_right.category in ["breaking"]:
|
|
249
242
|
change_category = result_right.category
|
|
250
|
-
elif change_category ==
|
|
251
|
-
if result_right.category in [
|
|
243
|
+
elif change_category == "non_breaking":
|
|
244
|
+
if result_right.category in ["breaking", "partial_breaking"]:
|
|
252
245
|
change_category = result_right.category
|
|
253
246
|
for column_name, column_change_status in result_right.columns.items():
|
|
254
247
|
changed_columns[column_name] = column_change_status
|
|
@@ -265,7 +258,7 @@ def parse_change_category(
|
|
|
265
258
|
perf_tracking: BreakingPerformanceTracking = None,
|
|
266
259
|
) -> NodeChange:
|
|
267
260
|
if old_sql == new_sql:
|
|
268
|
-
return NodeChange(category=
|
|
261
|
+
return NodeChange(category="non_breaking")
|
|
269
262
|
|
|
270
263
|
try:
|
|
271
264
|
dialect = Dialect.get(dialect)
|
|
@@ -294,31 +287,31 @@ def parse_change_category(
|
|
|
294
287
|
old_scopes = traverse_scope(old_exp)
|
|
295
288
|
new_scopes = traverse_scope(new_exp)
|
|
296
289
|
if len(old_scopes) != len(new_scopes):
|
|
297
|
-
return NodeChange(category=
|
|
290
|
+
return NodeChange(category="breaking", columns={})
|
|
298
291
|
|
|
299
292
|
scope_changes_map = {}
|
|
300
293
|
for old_scope, new_scope in zip(old_scopes, new_scopes):
|
|
301
294
|
if old_scope.expression.key != new_scope.expression.key:
|
|
302
|
-
scope_changes_map[new_scope] = NodeChange(category=
|
|
295
|
+
scope_changes_map[new_scope] = NodeChange(category="breaking")
|
|
303
296
|
continue
|
|
304
297
|
if old_scope == new_scope:
|
|
305
|
-
scope_changes_map[new_scope] = NodeChange(category=
|
|
298
|
+
scope_changes_map[new_scope] = NodeChange(category="non_breaking")
|
|
306
299
|
continue
|
|
307
300
|
|
|
308
301
|
scope_type = old_scope.expression.key
|
|
309
|
-
if scope_type ==
|
|
302
|
+
if scope_type == "select":
|
|
310
303
|
# CTE, Subquery, Root
|
|
311
304
|
result = _diff_select_scope(old_scope, new_scope, scope_changes_map)
|
|
312
|
-
elif scope_type ==
|
|
305
|
+
elif scope_type == "union":
|
|
313
306
|
# Union
|
|
314
307
|
result = _diff_union_scope(old_scope, new_scope, scope_changes_map)
|
|
315
308
|
else:
|
|
316
309
|
if old_scope.expression != new_scope.expression:
|
|
317
|
-
result = NodeChange(category=
|
|
310
|
+
result = NodeChange(category="breaking", columns={})
|
|
318
311
|
else:
|
|
319
|
-
result = NodeChange(category=
|
|
312
|
+
result = NodeChange(category="non_breaking", columns={})
|
|
320
313
|
|
|
321
|
-
if result.category ==
|
|
314
|
+
if result.category == "unknown":
|
|
322
315
|
return result
|
|
323
316
|
|
|
324
317
|
scope_changes_map[new_scope] = result
|
recce/util/cll.py
CHANGED
|
@@ -2,10 +2,21 @@ import time
|
|
|
2
2
|
from dataclasses import dataclass
|
|
3
3
|
from typing import Dict, List, Literal
|
|
4
4
|
|
|
5
|
-
from sqlglot import
|
|
6
|
-
from sqlglot.errors import
|
|
7
|
-
from sqlglot.expressions import
|
|
8
|
-
|
|
5
|
+
from sqlglot import Dialect, parse_one
|
|
6
|
+
from sqlglot.errors import OptimizeError, SqlglotError
|
|
7
|
+
from sqlglot.expressions import (
|
|
8
|
+
Alias,
|
|
9
|
+
Binary,
|
|
10
|
+
Case,
|
|
11
|
+
Column,
|
|
12
|
+
Expression,
|
|
13
|
+
Func,
|
|
14
|
+
If,
|
|
15
|
+
Intersect,
|
|
16
|
+
Paren,
|
|
17
|
+
Union,
|
|
18
|
+
)
|
|
19
|
+
from sqlglot.optimizer import Scope, traverse_scope
|
|
9
20
|
from sqlglot.optimizer.qualify import qualify
|
|
10
21
|
|
|
11
22
|
from recce.exceptions import RecceException
|
|
@@ -50,11 +61,11 @@ class CLLPerformanceTracking(metaclass=SingletonMeta):
|
|
|
50
61
|
|
|
51
62
|
def to_dict(self):
|
|
52
63
|
return {
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
64
|
+
"lineage_elapsed_ms": self.lineage_elapsed,
|
|
65
|
+
"column_lineage_elapsed_ms": self.column_lineage_elapsed,
|
|
66
|
+
"total_nodes": self.total_nodes,
|
|
67
|
+
"sqlglot_error_nodes": self.sqlglot_error_nodes,
|
|
68
|
+
"other_error_nodes": self.other_error_nodes,
|
|
58
69
|
}
|
|
59
70
|
|
|
60
71
|
def reset(self):
|
|
@@ -76,7 +87,7 @@ class ColumnLevelDependsOn:
|
|
|
76
87
|
|
|
77
88
|
@dataclass
|
|
78
89
|
class ColumnLevelDependencyColumn:
|
|
79
|
-
type: Literal[
|
|
90
|
+
type: Literal["source", "passthrough", "renamed", "derived"]
|
|
80
91
|
depends_on: List[ColumnLevelDependsOn]
|
|
81
92
|
|
|
82
93
|
|
|
@@ -93,10 +104,7 @@ def _cll_expression(expression, table_alias_map) -> ColumnLevelDependencyColumn:
|
|
|
93
104
|
else:
|
|
94
105
|
table = table_alias_map.get(alias, alias)
|
|
95
106
|
|
|
96
|
-
return ColumnLevelDependencyColumn(
|
|
97
|
-
type='passthrough',
|
|
98
|
-
depends_on=[ColumnLevelDependsOn(table, column.name)]
|
|
99
|
-
)
|
|
107
|
+
return ColumnLevelDependencyColumn(type="passthrough", depends_on=[ColumnLevelDependsOn(table, column.name)])
|
|
100
108
|
elif isinstance(expression, Paren):
|
|
101
109
|
return _cll_expression(expression.this, table_alias_map)
|
|
102
110
|
elif isinstance(expression, Binary):
|
|
@@ -107,31 +115,31 @@ def _cll_expression(expression, table_alias_map) -> ColumnLevelDependencyColumn:
|
|
|
107
115
|
if expression.right:
|
|
108
116
|
depends_on_right = _cll_expression(expression.right, table_alias_map).depends_on
|
|
109
117
|
depends_on.extend(depends_on_right)
|
|
110
|
-
type =
|
|
118
|
+
type = "derived" if depends_on else "source"
|
|
111
119
|
return ColumnLevelDependencyColumn(type=type, depends_on=depends_on)
|
|
112
120
|
elif isinstance(expression, Case):
|
|
113
|
-
ifs = expression.args[
|
|
114
|
-
default = expression.args[
|
|
121
|
+
ifs = expression.args["ifs"]
|
|
122
|
+
default = expression.args["default"]
|
|
115
123
|
depends_on = []
|
|
116
124
|
for expr in ifs:
|
|
117
125
|
depends_on_one = _cll_expression(expr, table_alias_map).depends_on
|
|
118
126
|
depends_on.extend(depends_on_one)
|
|
119
127
|
if default is not None:
|
|
120
128
|
depends_on.extend(_cll_expression(default, table_alias_map).depends_on)
|
|
121
|
-
type =
|
|
129
|
+
type = "derived" if depends_on else "source"
|
|
122
130
|
return ColumnLevelDependencyColumn(type=type, depends_on=depends_on)
|
|
123
131
|
elif isinstance(expression, If):
|
|
124
132
|
depends_on = []
|
|
125
133
|
if expression.this:
|
|
126
134
|
depends_on_one = _cll_expression(expression.this, table_alias_map).depends_on
|
|
127
135
|
depends_on.extend(depends_on_one)
|
|
128
|
-
if expression.args.get(
|
|
129
|
-
depends_on_one = _cll_expression(expression.args.get(
|
|
136
|
+
if expression.args.get("true"):
|
|
137
|
+
depends_on_one = _cll_expression(expression.args.get("true"), table_alias_map).depends_on
|
|
130
138
|
depends_on.extend(depends_on_one)
|
|
131
|
-
if expression.args.get(
|
|
132
|
-
depends_on_one = _cll_expression(expression.args.get(
|
|
139
|
+
if expression.args.get("false"):
|
|
140
|
+
depends_on_one = _cll_expression(expression.args.get("false"), table_alias_map).depends_on
|
|
133
141
|
depends_on.extend(depends_on_one)
|
|
134
|
-
type =
|
|
142
|
+
type = "derived" if depends_on else "source"
|
|
135
143
|
return ColumnLevelDependencyColumn(type=type, depends_on=depends_on)
|
|
136
144
|
elif isinstance(expression, Func):
|
|
137
145
|
if expression.expressions:
|
|
@@ -139,28 +147,28 @@ def _cll_expression(expression, table_alias_map) -> ColumnLevelDependencyColumn:
|
|
|
139
147
|
for expr in expression.expressions:
|
|
140
148
|
depends_on_one = _cll_expression(expr, table_alias_map).depends_on
|
|
141
149
|
depends_on.extend(depends_on_one)
|
|
142
|
-
type =
|
|
150
|
+
type = "derived" if depends_on else "source"
|
|
143
151
|
return ColumnLevelDependencyColumn(type=type, depends_on=depends_on)
|
|
144
152
|
if expression.this:
|
|
145
153
|
depends_on = _cll_expression(expression.this, table_alias_map).depends_on
|
|
146
|
-
type =
|
|
154
|
+
type = "derived" if depends_on else "source"
|
|
147
155
|
return ColumnLevelDependencyColumn(type=type, depends_on=depends_on)
|
|
148
156
|
|
|
149
|
-
return ColumnLevelDependencyColumn(type=
|
|
157
|
+
return ColumnLevelDependencyColumn(type="source", depends_on=[])
|
|
150
158
|
elif expression.this and isinstance(expression.this, Expression):
|
|
151
159
|
depends_on = _cll_expression(expression.this, table_alias_map).depends_on
|
|
152
|
-
type =
|
|
160
|
+
type = "derived" if depends_on else "source"
|
|
153
161
|
return ColumnLevelDependencyColumn(type=type, depends_on=depends_on)
|
|
154
162
|
elif expression.expressions:
|
|
155
163
|
depends_on = []
|
|
156
164
|
for expr in expression.expressions:
|
|
157
165
|
depends_on_one = _cll_expression(expr, table_alias_map).depends_on
|
|
158
166
|
depends_on.extend(depends_on_one)
|
|
159
|
-
type =
|
|
167
|
+
type = "derived" if depends_on else "source"
|
|
160
168
|
return ColumnLevelDependencyColumn(type=type, depends_on=depends_on)
|
|
161
169
|
else:
|
|
162
170
|
depends_on = []
|
|
163
|
-
return ColumnLevelDependencyColumn(type=
|
|
171
|
+
return ColumnLevelDependencyColumn(type="source", depends_on=depends_on)
|
|
164
172
|
|
|
165
173
|
|
|
166
174
|
def cll(sql, schema=None, dialect=None) -> Dict[str, ColumnLevelDependencyColumn]:
|
|
@@ -177,24 +185,21 @@ def cll(sql, schema=None, dialect=None) -> Dict[str, ColumnLevelDependencyColumn
|
|
|
177
185
|
try:
|
|
178
186
|
expression = parse_one(sql, dialect=dialect)
|
|
179
187
|
except SqlglotError as e:
|
|
180
|
-
raise RecceException(f
|
|
188
|
+
raise RecceException(f"Failed to parse SQL: {str(e)}")
|
|
181
189
|
|
|
182
190
|
try:
|
|
183
191
|
expression = qualify(expression, schema=schema, dialect=dialect)
|
|
184
192
|
except OptimizeError as e:
|
|
185
|
-
raise RecceException(f
|
|
193
|
+
raise RecceException(f"Failed to optimize SQL: {str(e)}")
|
|
186
194
|
except SqlglotError as e:
|
|
187
|
-
raise RecceException(f
|
|
195
|
+
raise RecceException(f"Failed to qualify SQL: {str(e)}")
|
|
188
196
|
|
|
189
197
|
result = {}
|
|
190
198
|
global_lineage = {}
|
|
191
199
|
for scope in traverse_scope(expression):
|
|
192
200
|
scope_lineage = {}
|
|
193
201
|
|
|
194
|
-
table_alias_map = {
|
|
195
|
-
t.alias_or_name: t.name
|
|
196
|
-
for t in scope.tables
|
|
197
|
-
}
|
|
202
|
+
table_alias_map = {t.alias_or_name: t.name for t in scope.tables}
|
|
198
203
|
|
|
199
204
|
if isinstance(scope.expression, Union) or isinstance(scope.expression, Intersect):
|
|
200
205
|
for union_scope in scope.union_scopes:
|
|
@@ -203,7 +208,7 @@ def cll(sql, schema=None, dialect=None) -> Dict[str, ColumnLevelDependencyColumn
|
|
|
203
208
|
scope_lineage[k] = v
|
|
204
209
|
else:
|
|
205
210
|
scope_lineage[k].depends_on.extend(v.depends_on)
|
|
206
|
-
scope_lineage[k].type =
|
|
211
|
+
scope_lineage[k].type = "derived"
|
|
207
212
|
else:
|
|
208
213
|
for select in scope.expression.selects:
|
|
209
214
|
# instance of Column
|
|
@@ -218,14 +223,14 @@ def cll(sql, schema=None, dialect=None) -> Dict[str, ColumnLevelDependencyColumn
|
|
|
218
223
|
col_expression = alias.this
|
|
219
224
|
column_cll = _cll_expression(col_expression, table_alias_map)
|
|
220
225
|
if (
|
|
221
|
-
column_cll
|
|
222
|
-
column_cll.type ==
|
|
223
|
-
column_cll.depends_on[0].column != alias.alias_or_name
|
|
226
|
+
column_cll
|
|
227
|
+
and column_cll.type == "passthrough"
|
|
228
|
+
and column_cll.depends_on[0].column != alias.alias_or_name
|
|
224
229
|
):
|
|
225
|
-
column_cll.type =
|
|
230
|
+
column_cll.type = "renamed"
|
|
226
231
|
else:
|
|
227
232
|
# 'select 1'
|
|
228
|
-
column_cll = ColumnLevelDependencyColumn(type=
|
|
233
|
+
column_cll = ColumnLevelDependencyColumn(type="source", depends_on=[])
|
|
229
234
|
|
|
230
235
|
cte_type = None
|
|
231
236
|
flatten_col_depends_on = []
|
|
@@ -258,24 +263,24 @@ def cll(sql, schema=None, dialect=None) -> Dict[str, ColumnLevelDependencyColumn
|
|
|
258
263
|
dedup_col_depends_on = []
|
|
259
264
|
dedup_set = set()
|
|
260
265
|
for col_dep in flatten_col_depends_on:
|
|
261
|
-
node_col = col_dep.node +
|
|
266
|
+
node_col = col_dep.node + "." + col_dep.column
|
|
262
267
|
if node_col not in dedup_set:
|
|
263
268
|
dedup_col_depends_on.append(col_dep)
|
|
264
269
|
dedup_set.add(node_col)
|
|
265
270
|
|
|
266
271
|
# transformation type
|
|
267
272
|
type = column_cll.type
|
|
268
|
-
if type ==
|
|
273
|
+
if type == "derived":
|
|
269
274
|
if len(dedup_col_depends_on) == 0:
|
|
270
|
-
type =
|
|
275
|
+
type = "source"
|
|
271
276
|
else:
|
|
272
277
|
# keep current scope type
|
|
273
278
|
pass
|
|
274
279
|
elif cte_type is not None:
|
|
275
280
|
if len(dedup_col_depends_on) > 1:
|
|
276
|
-
type =
|
|
281
|
+
type = "derived"
|
|
277
282
|
elif len(dedup_col_depends_on) == 0:
|
|
278
|
-
type =
|
|
283
|
+
type = "source"
|
|
279
284
|
else:
|
|
280
285
|
if isinstance(select, Column):
|
|
281
286
|
type = cte_type
|
|
@@ -284,13 +289,12 @@ def cll(sql, schema=None, dialect=None) -> Dict[str, ColumnLevelDependencyColumn
|
|
|
284
289
|
if column_cll.depends_on[0].column == alias.alias_or_name:
|
|
285
290
|
type = cte_type
|
|
286
291
|
else:
|
|
287
|
-
type =
|
|
292
|
+
type = "renamed" if cte_type == "passthrough" else cte_type
|
|
288
293
|
else:
|
|
289
|
-
type =
|
|
294
|
+
type = "source"
|
|
290
295
|
|
|
291
296
|
scope_lineage[select.alias_or_name] = ColumnLevelDependencyColumn(
|
|
292
|
-
type=type,
|
|
293
|
-
depends_on=dedup_col_depends_on
|
|
297
|
+
type=type, depends_on=dedup_col_depends_on
|
|
294
298
|
)
|
|
295
299
|
|
|
296
300
|
global_lineage[scope] = scope_lineage
|