sql-glider 0.1.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sql_glider-0.1.8.dist-info/METADATA +893 -0
- sql_glider-0.1.8.dist-info/RECORD +34 -0
- sql_glider-0.1.8.dist-info/WHEEL +4 -0
- sql_glider-0.1.8.dist-info/entry_points.txt +9 -0
- sql_glider-0.1.8.dist-info/licenses/LICENSE +201 -0
- sqlglider/__init__.py +3 -0
- sqlglider/_version.py +34 -0
- sqlglider/catalog/__init__.py +30 -0
- sqlglider/catalog/base.py +99 -0
- sqlglider/catalog/databricks.py +255 -0
- sqlglider/catalog/registry.py +121 -0
- sqlglider/cli.py +1589 -0
- sqlglider/dissection/__init__.py +17 -0
- sqlglider/dissection/analyzer.py +767 -0
- sqlglider/dissection/formatters.py +222 -0
- sqlglider/dissection/models.py +112 -0
- sqlglider/global_models.py +17 -0
- sqlglider/graph/__init__.py +42 -0
- sqlglider/graph/builder.py +349 -0
- sqlglider/graph/merge.py +136 -0
- sqlglider/graph/models.py +289 -0
- sqlglider/graph/query.py +287 -0
- sqlglider/graph/serialization.py +107 -0
- sqlglider/lineage/__init__.py +10 -0
- sqlglider/lineage/analyzer.py +1631 -0
- sqlglider/lineage/formatters.py +335 -0
- sqlglider/templating/__init__.py +51 -0
- sqlglider/templating/base.py +103 -0
- sqlglider/templating/jinja.py +163 -0
- sqlglider/templating/registry.py +124 -0
- sqlglider/templating/variables.py +295 -0
- sqlglider/utils/__init__.py +11 -0
- sqlglider/utils/config.py +155 -0
- sqlglider/utils/file_utils.py +38 -0
|
@@ -0,0 +1,767 @@
|
|
|
1
|
+
"""SQL query dissection analyzer."""
|
|
2
|
+
|
|
3
|
+
from typing import List, Optional, Set, Tuple
|
|
4
|
+
|
|
5
|
+
from sqlglot import exp, parse
|
|
6
|
+
from sqlglot.errors import ParseError
|
|
7
|
+
|
|
8
|
+
from sqlglider.dissection.models import (
|
|
9
|
+
ComponentType,
|
|
10
|
+
QueryDissectionResult,
|
|
11
|
+
QueryMetadata,
|
|
12
|
+
SQLComponent,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class DissectionAnalyzer:
|
|
17
|
+
"""Analyze and dissect SQL queries into components."""
|
|
18
|
+
|
|
19
|
+
def __init__(self, sql: str, dialect: str = "spark"):
    """
    Parse ``sql`` and keep the resulting statements for dissection.

    Args:
        sql: SQL query string (can contain multiple statements)
        dialect: SQL dialect passed to the parser (default: spark)

    Raises:
        ParseError: If the SQL cannot be parsed, or if nothing is left
            after dropping the ``None`` entries returned by the parser.
            Both cases are re-raised with an "Invalid SQL syntax: " prefix
            because the emptiness check sits inside the ``try`` block.
    """
    self.sql = sql
    self.dialect = dialect

    try:
        # The parser may return None entries (e.g. for empty statements
        # or comments); only real expressions are kept
        statements: List[exp.Expression] = []
        for statement in parse(sql, dialect=dialect):
            if statement is not None:
                statements.append(statement)
        self.expressions = statements

        if len(statements) == 0:
            raise ParseError("No valid SQL statements found")

    except ParseError as err:
        raise ParseError(f"Invalid SQL syntax: {err}") from err
|
|
47
|
+
|
|
48
|
+
def dissect_queries(self) -> List[QueryDissectionResult]:
    """
    Dissect every statement parsed by the analyzer.

    Returns:
        List of QueryDissectionResult objects, one per parsed statement
        and in the same order; each statement is dissected with its
        zero-based position as the query index
    """
    return [
        self._dissect_single_query(statement, position)
        for position, statement in enumerate(self.expressions)
    ]
|
|
62
|
+
|
|
63
|
+
def _dissect_single_query(
    self, expr: exp.Expression, query_index: int
) -> QueryDissectionResult:
    """
    Break one parsed statement into an ordered list of components.

    Components share a single running index and are emitted in this order:
    every aliased CTE of the WITH clause (each immediately followed by the
    scalar subqueries found inside it), the target table when
    ``_get_target_table`` reports one, and finally the root query. The
    root is the value of ``_get_source_select`` when that is truthy
    (SOURCE_QUERY); otherwise it is the statement itself when that is a
    SELECT/UNION (MAIN_QUERY); any other statement gets no root component.
    A UNION root is split into branches, while any other root has its
    FROM-clause subqueries and then its scalar subqueries extracted.

    Args:
        expr: SQLGlot expression to dissect
        query_index: Index of query in multi-query file

    Returns:
        QueryDissectionResult with the metadata, all extracted components
        and the statement rendered back to SQL in the analyzer's dialect
    """
    components: List[SQLComponent] = []
    next_index = 0

    stmt_type = self._get_statement_type(expr)

    # CTE names collected so far; used for dependency analysis
    cte_names: Set[str] = set()

    # --- CTEs (if the statement carries a WITH clause) -------------------
    with_clause = expr.args.get("with") if hasattr(expr, "args") else None
    if with_clause:
        for cte in with_clause.expressions:
            if not (isinstance(cte, exp.CTE) and cte.alias):
                continue

            cte_name = cte.alias
            # The name is registered before the dependency lookup, so a
            # CTE whose body references a table with the CTE's own name is
            # reported as depending on itself
            cte_names.add(cte_name)
            cte_body = cte.this
            cte_sql = cte_body.sql(dialect=self.dialect)
            cte_deps = self._extract_cte_dependencies(cte_body, cte_names)

            cte_index = next_index
            components.append(
                SQLComponent(
                    component_type=ComponentType.CTE,
                    component_index=cte_index,
                    name=cte_name,
                    sql=cte_sql,
                    parent_index=None,
                    depth=0,
                    is_executable=True,
                    dependencies=cte_deps,
                    location="WITH clause",
                )
            )

            # Scalar subqueries inside the CTE become its children
            next_index = self._extract_scalar_subqueries(
                cte_body,
                components,
                cte_index + 1,
                cte_names,
                parent_index=cte_index,
                depth=1,
                parent_context=f"CTE '{cte_name}'",
            )

    # --- Target table (for DML/DDL) --------------------------------------
    target_table, target_type = self._get_target_table(expr)
    if target_table:
        components.append(
            SQLComponent(
                component_type=ComponentType.TARGET_TABLE,
                component_index=next_index,
                name=target_table,
                sql=target_table,
                parent_index=None,
                depth=0,
                is_executable=False,
                dependencies=[],
                location=f"{target_type} target",
            )
        )
        next_index += 1

    # --- Root query: INSERT/CTAS/MERGE source, or the bare SELECT/UNION --
    source_select = self._get_source_select(expr)
    if source_select:
        root = source_select
        root_type = ComponentType.SOURCE_QUERY
        root_label = "SOURCE_QUERY"
        root_is_source = True
    elif isinstance(expr, (exp.Select, exp.Union)):
        root = expr
        root_type = ComponentType.MAIN_QUERY
        root_label = "MAIN_QUERY"
        root_is_source = False
    else:
        root = None

    if root is not None:
        root_sql = root.sql(dialect=self.dialect)
        root_deps = self._extract_cte_dependencies(root, cte_names)
        if root_is_source:
            root_location = self._get_source_location(expr)
        else:
            root_location = "Top-level query"

        root_index = next_index
        components.append(
            SQLComponent(
                component_type=root_type,
                component_index=root_index,
                name=None,
                sql=root_sql,
                parent_index=None,
                depth=0,
                is_executable=True,
                dependencies=root_deps,
                location=root_location,
            )
        )
        next_index += 1

        if self._is_union(root):
            # A UNION root is dissected branch by branch
            next_index = self._extract_union_branches(
                root,
                components,
                next_index,
                cte_names,
                parent_index=root_index,
                depth=1,
            )
        else:
            # FROM-clause subqueries first, then scalar subqueries
            next_index = self._extract_subqueries(
                root,
                components,
                next_index,
                cte_names,
                parent_index=root_index,
                depth=1,
                parent_context=root_label,
            )
            next_index = self._extract_scalar_subqueries(
                root,
                components,
                next_index,
                cte_names,
                parent_index=root_index,
                depth=1,
                parent_context=root_label,
            )

    # --- Metadata and result ---------------------------------------------
    metadata = QueryMetadata(
        query_index=query_index,
        query_preview=self._generate_query_preview(expr),
        statement_type=stmt_type,
        total_components=len(components),
    )

    return QueryDissectionResult(
        metadata=metadata,
        components=components,
        original_sql=expr.sql(dialect=self.dialect),
    )
|
|
267
|
+
|
|
268
|
+
def _is_union(self, node: exp.Expression) -> bool:
    """Return True when ``node`` is an instance of SQLGlot's ``exp.Union``."""
    union_cls = exp.Union
    return isinstance(node, union_cls)
|
|
271
|
+
|
|
272
|
+
def _extract_union_branches(
    self,
    union_node: exp.Expression,
    components: List[SQLComponent],
    component_counter: int,
    cte_names: Set[str],
    parent_index: int,
    depth: int,
) -> int:
    """
    Emit one UNION_BRANCH component per leaf of a UNION tree.

    Each branch component is followed by the FROM-clause subqueries and
    then the scalar subqueries extracted from that branch, one level
    deeper than the branch itself.

    Args:
        union_node: The UNION expression
        components: List the new components are appended to
        component_counter: Index to assign to the first new component
        cte_names: Set of CTE names for dependency analysis
        parent_index: Index of the component that owns the UNION
        depth: Nesting depth assigned to the branch components

    Returns:
        Updated component_counter
    """
    flattened = self._flatten_union(union_node)
    branch_total = len(flattened)
    next_index = component_counter

    for ordinal, (branch, union_type) in enumerate(flattened, start=1):
        # Names are zero-based while the location text is one-based
        branch_name = f"branch_{ordinal - 1}"
        branch_sql = branch.sql(dialect=self.dialect)
        branch_deps = self._extract_cte_dependencies(branch, cte_names)

        location = f"UNION branch {ordinal} of {branch_total}"
        if union_type:
            location = f"{location} ({union_type})"

        branch_index = next_index
        components.append(
            SQLComponent(
                component_type=ComponentType.UNION_BRANCH,
                component_index=branch_index,
                name=branch_name,
                sql=branch_sql,
                parent_index=parent_index,
                depth=depth,
                is_executable=True,
                dependencies=branch_deps,
                location=location,
            )
        )

        # Both extraction passes attach their findings to this branch
        child_kwargs = {
            "parent_index": branch_index,
            "depth": depth + 1,
            "parent_context": f"UNION_BRANCH '{branch_name}'",
        }
        next_index = self._extract_subqueries(
            branch, components, branch_index + 1, cte_names, **child_kwargs
        )
        next_index = self._extract_scalar_subqueries(
            branch, components, next_index, cte_names, **child_kwargs
        )

    return next_index
|
|
346
|
+
|
|
347
|
+
def _flatten_union(
    self, node: exp.Expression
) -> List[Tuple[exp.Expression, Optional[str]]]:
    """
    Flatten a UNION tree into its leaf branches, left to right.

    Args:
        node: The UNION expression to flatten

    Returns:
        List of (branch expression, union_type) tuples. union_type is
        'UNION ALL' or 'UNION' for a leaf that is the direct right-hand
        operand of a union node, and None for every other leaf (the
        left-most leaf, or a leaf reached through a union's left side).
    """
    branches: List[Tuple[exp.Expression, Optional[str]]] = []

    # Explicit stack instead of recursion; entries are (node, inherited type)
    pending: List[Tuple[exp.Expression, Optional[str]]] = [(node, None)]
    while pending:
        current, inherited_type = pending.pop()

        if not isinstance(current, exp.Union):
            # Leaf node: record it with the type handed down by its parent
            branches.append((current, inherited_type))
            continue

        # "distinct" explicitly False marks UNION ALL; anything else is UNION
        if current.args.get("distinct") is False:
            union_type = "UNION ALL"
        else:
            union_type = "UNION"

        # Push the right side first so the left side is processed first
        pending.append((current.expression, union_type))
        pending.append((current.this, None))

    return branches
|
|
384
|
+
|
|
385
|
+
def _extract_subqueries(
    self,
    node: exp.Expression,
    components: List[SQLComponent],
    component_counter: int,
    cte_names: Set[str],
    parent_index: int,
    depth: int,
    parent_context: str,
) -> int:
    """
    Extract FROM-clause subqueries found under ``node``.

    A subquery qualifies when, walking up its parents, a FROM or JOIN node
    is reached before a SELECT, WHERE or HAVING node. Candidates rejected
    by ``_is_nested_in_already_extracted`` are skipped. For every
    extracted subquery whose body is a SELECT, nested FROM-clause and
    scalar subqueries are searched recursively one level deeper.

    Args:
        node: SELECT expression to search
        components: List to append components to
        component_counter: Current component index counter
        cte_names: Set of CTE names for dependency analysis
        parent_index: Index of parent component
        depth: Current nesting depth
        parent_context: Description of parent for location string

    Returns:
        Updated component_counter
    """
    next_index = component_counter

    for subquery in node.find_all(exp.Subquery):
        # Climb the parents until a clause node decides the classification
        in_from_clause = False
        ancestor = subquery.parent
        while ancestor:
            if isinstance(ancestor, (exp.From, exp.Join)):
                in_from_clause = True
                break
            if isinstance(ancestor, (exp.Select, exp.Where, exp.Having)):
                # Scalar position; handled by _extract_scalar_subqueries
                break
            ancestor = ancestor.parent

        if not in_from_clause:
            continue

        # Skip if already covered by a previously extracted subquery
        if self._is_nested_in_already_extracted(subquery, components):
            continue

        inner = subquery.this
        inner_sql = inner.sql(dialect=self.dialect)
        # Unaliased subqueries get a synthetic name based on their index
        alias = subquery.alias or f"subquery_{next_index}"
        inner_deps = self._extract_cte_dependencies(inner, cte_names)

        own_index = next_index
        components.append(
            SQLComponent(
                component_type=ComponentType.SUBQUERY,
                component_index=own_index,
                name=alias,
                sql=inner_sql,
                parent_index=parent_index,
                depth=depth,
                is_executable=True,
                dependencies=inner_deps,
                location=f"FROM clause in {parent_context}",
            )
        )
        next_index = own_index + 1

        if not isinstance(inner, exp.Select):
            continue

        # Recurse into the subquery body, attaching findings to it
        child_kwargs = {
            "parent_index": own_index,
            "depth": depth + 1,
            "parent_context": f"SUBQUERY '{alias}'",
        }
        next_index = self._extract_subqueries(
            inner, components, next_index, cte_names, **child_kwargs
        )
        next_index = self._extract_scalar_subqueries(
            inner, components, next_index, cte_names, **child_kwargs
        )

    return next_index
|
|
479
|
+
|
|
480
|
+
def _extract_scalar_subqueries(
    self,
    node: exp.Expression,
    components: List[SQLComponent],
    component_counter: int,
    cte_names: Set[str],
    parent_index: int,
    depth: int,
    parent_context: str,
) -> int:
    """
    Extract subqueries under ``node`` that are not in a FROM clause.

    Every ``exp.Subquery`` found under ``node`` is considered. It is
    skipped when ``_is_nested_in_already_extracted`` reports it as nested,
    or when ``_get_scalar_subquery_location`` returns None (FROM-clause
    subquery). For each extracted subquery whose body is a SELECT, the
    search continues recursively inside that body one level deeper.

    Args:
        node: Expression to search for scalar subqueries
        components: List to append components to
        component_counter: Current component index counter
        cte_names: Set of CTE names for dependency analysis
        parent_index: Index of parent component
        depth: Current nesting depth
        parent_context: Description of parent for location string

    Returns:
        Updated component_counter
    """
    next_index = component_counter

    for subquery in node.find_all(exp.Subquery):
        # Skip if already covered by a previously extracted subquery
        if self._is_nested_in_already_extracted(subquery, components):
            continue

        location = self._get_scalar_subquery_location(subquery, parent_context)
        if location is None:
            # FROM-clause subquery; not handled here
            continue

        inner = subquery.this
        inner_sql = inner.sql(dialect=self.dialect)
        # The name comes from the subquery alias or an enclosing alias node
        name = self._get_scalar_subquery_name(subquery)
        inner_deps = self._extract_cte_dependencies(inner, cte_names)

        own_index = next_index
        components.append(
            SQLComponent(
                component_type=ComponentType.SCALAR_SUBQUERY,
                component_index=own_index,
                name=name,
                sql=inner_sql,
                parent_index=parent_index,
                depth=depth,
                is_executable=True,
                dependencies=inner_deps,
                location=location,
            )
        )
        next_index = own_index + 1

        if isinstance(inner, exp.Select):
            # Look for scalar subqueries nested inside this one
            next_index = self._extract_scalar_subqueries(
                inner,
                components,
                next_index,
                cte_names,
                parent_index=own_index,
                depth=depth + 1,
                parent_context="SCALAR_SUBQUERY",
            )

    return next_index
|
|
553
|
+
|
|
554
|
+
def _get_scalar_subquery_location(
    self, subquery: exp.Subquery, parent_context: str
) -> Optional[str]:
    """
    Describe where a scalar subquery sits by walking up its parents.

    The first ancestor that matches one of the rules below decides:
    FROM/JOIN means the subquery is not scalar; a SELECT whose
    ``expressions`` contain the node the walk arrived from means the
    SELECT list; WHERE and HAVING map to their clause. Any other ancestor
    (comparisons, IN, BETWEEN, ...) is simply stepped over.

    Args:
        subquery: Subquery node to locate
        parent_context: Description of the owning component, embedded in
            the returned text

    Returns:
        Location text, or None if this is a FROM-clause subquery. When no
        ancestor matches, "Expression in <parent_context>" is returned.
    """
    child = subquery
    ancestor = subquery.parent

    while ancestor:
        # FROM or JOIN reached before SELECT/WHERE/HAVING: not scalar
        if isinstance(ancestor, (exp.From, exp.Join)):
            return None

        # Only a SELECT-list position counts; other SELECT children
        # keep climbing
        if isinstance(ancestor, exp.Select) and child in getattr(
            ancestor, "expressions", []
        ):
            return f"SELECT list in {parent_context}"

        if isinstance(ancestor, exp.Where):
            return f"WHERE clause in {parent_context}"

        if isinstance(ancestor, exp.Having):
            return f"HAVING clause in {parent_context}"

        child, ancestor = ancestor, ancestor.parent

    # Default - assume it's in the query somewhere
    return f"Expression in {parent_context}"
|
|
605
|
+
|
|
606
|
+
def _get_scalar_subquery_name(self, subquery: exp.Subquery) -> Optional[str]:
|
|
607
|
+
"""Get the alias/name for a scalar subquery if available."""
|
|
608
|
+
# Check if subquery has a direct alias
|
|
609
|
+
if subquery.alias:
|
|
610
|
+
return subquery.alias
|
|
611
|
+
|
|
612
|
+
# Check if parent is a column alias
|
|
613
|
+
if subquery.parent and isinstance(subquery.parent, exp.Alias):
|
|
614
|
+
return subquery.parent.alias
|
|
615
|
+
|
|
616
|
+
return None
|
|
617
|
+
|
|
618
|
+
def _is_nested_in_already_extracted(
|
|
619
|
+
self, subquery: exp.Subquery, components: List[SQLComponent]
|
|
620
|
+
) -> bool:
|
|
621
|
+
"""Check if this subquery is nested inside an already-extracted subquery."""
|
|
622
|
+
# Get the SQL of this subquery
|
|
623
|
+
subquery_sql = subquery.this.sql(dialect=self.dialect)
|
|
624
|
+
|
|
625
|
+
# Check if any existing component's SQL contains this subquery's SQL
|
|
626
|
+
# (but is not exactly equal to it)
|
|
627
|
+
for comp in components:
|
|
628
|
+
if comp.component_type in (
|
|
629
|
+
ComponentType.SUBQUERY,
|
|
630
|
+
ComponentType.SCALAR_SUBQUERY,
|
|
631
|
+
):
|
|
632
|
+
if subquery_sql in comp.sql and subquery_sql != comp.sql:
|
|
633
|
+
return True
|
|
634
|
+
|
|
635
|
+
return False
|
|
636
|
+
|
|
637
|
+
def _extract_cte_dependencies(
    self, node: exp.Expression, cte_names: Set[str]
) -> List[str]:
    """
    Extract CTE dependencies from an expression.

    Only unqualified table references are matched against ``cte_names``.
    A CTE can only be referenced by its bare name, so a table that carries
    a database or catalog qualifier (``raw.orders``) is a real table even
    when its name equals that of a CTE (``orders``).

    Args:
        node: SQLGlot expression to analyze
        cte_names: Set of CTE names defined in the query

    Returns:
        Sorted list of CTE names this expression depends on
    """
    if not cte_names:
        # Nothing can match, so the tree walk is skipped entirely
        return []

    dependencies: Set[str] = set()

    for table_node in node.find_all(exp.Table):
        if table_node.db or table_node.catalog:
            # Qualified name: refers to a physical table, not a CTE
            continue
        table_name = table_node.name
        if table_name in cte_names:
            dependencies.add(table_name)

    return sorted(dependencies)
|
|
659
|
+
|
|
660
|
+
def _get_target_table(
    self, expr: exp.Expression
) -> Tuple[Optional[str], Optional[str]]:
    """
    Get target table name and statement type for DML/DDL.

    Handles INSERT, CREATE and MERGE statements. When the target is
    wrapped in an ``exp.Schema`` (a table followed by a column list, as in
    ``INSERT INTO t (a, b) SELECT ...`` or ``CREATE TABLE t (a INT)``),
    the table inside the schema is used.

    Args:
        expr: Statement to inspect

    Returns:
        Tuple of (table_name, location_prefix) or (None, None) when the
        statement has no table target
    """
    if isinstance(expr, exp.Insert):
        location_prefix = "INSERT INTO"
    elif isinstance(expr, exp.Create):
        # The kind attribute may be None; treat that as an empty kind
        # instead of failing on None.upper()
        kind = (getattr(expr, "kind", None) or "").upper()
        location_prefix = f"CREATE {kind}".strip()
    elif isinstance(expr, exp.Merge):
        location_prefix = "MERGE INTO"
    else:
        return (None, None)

    target = expr.this
    if isinstance(target, exp.Schema):
        # Unwrap "table (col, ...)" to reach the table itself
        target = target.this

    if isinstance(target, exp.Table):
        return (self._get_qualified_table_name(target), location_prefix)

    return (None, None)
|
|
688
|
+
|
|
689
|
+
def _get_source_select(self, expr: exp.Expression) -> Optional[exp.Expression]:
    """
    Get the source SELECT/UNION from INSERT/CTAS/MERGE.

    For INSERT and CREATE the statement's ``expression`` is returned when
    it is a SELECT or UNION. For MERGE the USING argument is returned when
    it is a SELECT/UNION, or its body when it is a subquery; failing that,
    the first SELECT found anywhere in the MERGE is used.

    Returns:
        SELECT/UNION expression or None
    """
    query_types = (exp.Select, exp.Union)

    if isinstance(expr, (exp.Insert, exp.Create)):
        candidate = expr.expression
        return candidate if isinstance(candidate, query_types) else None

    if isinstance(expr, exp.Merge):
        using = expr.args.get("using")
        if using:
            if isinstance(using, query_types):
                return using
            if isinstance(using, exp.Subquery):
                return using.this

        # Fallback: first SELECT anywhere in the statement (None if absent)
        return next(iter(expr.find_all(exp.Select)), None)

    return None
|
|
720
|
+
|
|
721
|
+
def _get_source_location(self, expr: exp.Expression) -> str:
    """
    Get the location description for source query.

    Args:
        expr: The enclosing statement (INSERT, CREATE or MERGE)

    Returns:
        "INSERT source SELECT", "CREATE <KIND> AS SELECT",
        "MERGE USING clause", or "Source SELECT" for anything else. A
        CREATE statement without a kind yields "CREATE AS SELECT".
    """
    if isinstance(expr, exp.Insert):
        return "INSERT source SELECT"

    if isinstance(expr, exp.Create):
        # The kind attribute may be None; fall back to an empty kind
        # rather than failing on None.upper()
        kind = (getattr(expr, "kind", None) or "").upper()
        return " ".join(part for part in ("CREATE", kind, "AS SELECT") if part)

    if isinstance(expr, exp.Merge):
        return "MERGE USING clause"

    return "Source SELECT"
|
|
731
|
+
|
|
732
|
+
def _get_qualified_table_name(self, table: exp.Table) -> str:
|
|
733
|
+
"""Get fully qualified table name."""
|
|
734
|
+
parts = []
|
|
735
|
+
if table.catalog:
|
|
736
|
+
parts.append(table.catalog)
|
|
737
|
+
if table.db:
|
|
738
|
+
parts.append(table.db)
|
|
739
|
+
parts.append(table.name)
|
|
740
|
+
return ".".join(parts)
|
|
741
|
+
|
|
742
|
+
def _get_statement_type(self, expr: exp.Expression) -> str:
    """
    Get human-readable statement type.

    Args:
        expr: Parsed statement

    Returns:
        "CREATE <kind>" for CREATE statements (just "CREATE" when the
        statement has no kind), the mapped keyword for the known statement
        classes, and the upper-cased class name for everything else
    """
    if isinstance(expr, exp.Create):
        # The kind attribute may be None; without the fallback the result
        # would read "CREATE None"
        kind = getattr(expr, "kind", None) or ""
        return f"CREATE {kind}".strip()

    type_map = {
        "Select": "SELECT",
        "Insert": "INSERT",
        "Merge": "MERGE",
        "Update": "UPDATE",
        "Delete": "DELETE",
        "Union": "SELECT",  # UNION is still a SELECT-type query
    }

    expr_type = type(expr).__name__
    return type_map.get(expr_type, expr_type.upper())
|
|
760
|
+
|
|
761
|
+
def _generate_query_preview(self, expr: exp.Expression) -> str:
|
|
762
|
+
"""Generate preview string (first 100 chars)."""
|
|
763
|
+
query_text = expr.sql(dialect=self.dialect)
|
|
764
|
+
preview = " ".join(query_text.split())[:100]
|
|
765
|
+
if len(" ".join(query_text.split())) > 100:
|
|
766
|
+
preview += "..."
|
|
767
|
+
return preview
|