sql-code-graph 1.35.1__py3-none-any.whl → 1.35.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sql_code_graph-1.35.1.dist-info → sql_code_graph-1.35.3.dist-info}/METADATA +1 -1
- {sql_code_graph-1.35.1.dist-info → sql_code_graph-1.35.3.dist-info}/RECORD +9 -8
- sqlcg/__init__.py +1 -1
- sqlcg/indexer/indexer.py +61 -9
- sqlcg/parsers/base.py +193 -18
- sqlcg/parsers/bigquery_parser.py +5 -1
- sqlcg/parsers/dynamic_name.py +170 -0
- {sql_code_graph-1.35.1.dist-info → sql_code_graph-1.35.3.dist-info}/WHEEL +0 -0
- {sql_code_graph-1.35.1.dist-info → sql_code_graph-1.35.3.dist-info}/entry_points.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sql-code-graph
|
|
3
|
-
Version: 1.35.1
|
|
3
|
+
Version: 1.35.3
|
|
4
4
|
Summary: SQL code graph analyzer and lineage tracer
|
|
5
5
|
Project-URL: Homepage, https://github.com/Warhorze/sql-code-graph
|
|
6
6
|
Project-URL: Repository, https://github.com/Warhorze/sql-code-graph
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
sqlcg/__init__.py,sha256=
|
|
1
|
+
sqlcg/__init__.py,sha256=iuiB1QBl9EVtoW0aNt89z6gr_yDaZ5RV7phaFO8zX0Y,116
|
|
2
2
|
sqlcg/__main__.py,sha256=1YoFLcqEgTwYq1J3TbUwpkdG0zeeLIf2fJvwWI-CLFU,109
|
|
3
3
|
sqlcg/cli/__init__.py,sha256=W8fD0LpMq2xm_5WKGNMvJh2WBL1ho5E8hUeAqXQYT1g,28
|
|
4
4
|
sqlcg/cli/coverage.py,sha256=Xm9ITzZDHv2mJ70Q5jCacVuhDStVrE3gq12_-Ypvtd8,43823
|
|
@@ -34,7 +34,7 @@ sqlcg/indexer/__init__.py,sha256=Wh20Unz2OHs1oIyWLrpurPAasF0BET2g4iXtNk7mh2U,56
|
|
|
34
34
|
sqlcg/indexer/dbt_adapter.py,sha256=EB5x1WU5Z9d-I97ADDj88S_hG1C4z4nbrv8JUCzXfy8,686
|
|
35
35
|
sqlcg/indexer/error_classify.py,sha256=-sp8cRmuOBHu_CxnCtaXf34YxHFYwIFNjIrn4LaEv6M,7142
|
|
36
36
|
sqlcg/indexer/git_delta.py,sha256=zYdH5q-jV7w_ne8Oxdywsy0N3rwUjpd5RjEDurlrMSA,5026
|
|
37
|
-
sqlcg/indexer/indexer.py,sha256=
|
|
37
|
+
sqlcg/indexer/indexer.py,sha256=LNc5pI3_WwIMT2iTnCu9VB1Kk21T1qAmN7fupyiQu2s,105315
|
|
38
38
|
sqlcg/indexer/pool.py,sha256=iMmCQtpDRKBTQBep2_EUq9THcsE18Zgk0hdaFB_CwiA,19006
|
|
39
39
|
sqlcg/indexer/walker.py,sha256=Cft6JiJtdBFy0HR6L9pJdr5Fg0eRR3XBW1OMtM2apto,1947
|
|
40
40
|
sqlcg/indexer/watcher.py,sha256=mJQq1LASRLKKwhz0WhCUWPLLqyPR2_-FD_8efYU6gE8,8442
|
|
@@ -45,8 +45,9 @@ sqlcg/metrics/__init__.py,sha256=hLJ6wm4St8qqYwKh3o9QG7lcEt1BEYM31ccqO9tGpIg,133
|
|
|
45
45
|
sqlcg/metrics/store.py,sha256=KuDtxvyAgug9_KtiSCpvgKM2VZM7VSaI3D11uMLjJJk,10604
|
|
46
46
|
sqlcg/parsers/__init__.py,sha256=AamA8wBbDZV9_zEtZCI4Hyen5UAVKHmBwjTghTt2PZE,785
|
|
47
47
|
sqlcg/parsers/ansi_parser.py,sha256=RX6eVj7gt1qmsHNJLAF_a4jyW3RCI5W2oF4rd53cKNg,39336
|
|
48
|
-
sqlcg/parsers/base.py,sha256=
|
|
49
|
-
sqlcg/parsers/bigquery_parser.py,sha256=
|
|
48
|
+
sqlcg/parsers/base.py,sha256=d5s5_LSv96jrww9vx52GujjrLHwpxy_UOhmIlWcKglw,106489
|
|
49
|
+
sqlcg/parsers/bigquery_parser.py,sha256=g0B6aIpMyxLMVQ3ohAAjzR4nEmMh-WGkFcYLMiKdLxs,3177
|
|
50
|
+
sqlcg/parsers/dynamic_name.py,sha256=q0QAa9iAcmRW4e_0G2b2j-xTbI3VR1-Wwa-nJRLtrQw,6836
|
|
50
51
|
sqlcg/parsers/postgres_parser.py,sha256=lYfUpQY6j4Qm7ndXBtXbgPoGzYqYddWt5YeFnWKdA6I,946
|
|
51
52
|
sqlcg/parsers/registry.py,sha256=LXy1F6rqQI6VdxpRvZg_tNpoEucW3mXZHYBMlMONbX4,1496
|
|
52
53
|
sqlcg/parsers/snowflake_parser.py,sha256=cv7bzBm6Wmwa8uY41Y59ebfFjnP1Gk0Sjp2KN_QBGD8,47542
|
|
@@ -72,7 +73,7 @@ sqlcg/viz/render.py,sha256=BINkGbJbbb_iqhrkN795RaQsdg8nqCiJtsEFF1yo22Y,2737
|
|
|
72
73
|
sqlcg/viz/tags.py,sha256=6zRnGlHjuGmEeB6yN1uhzm8rqL7ZGoyL1Ki7jI5oM6A,5368
|
|
73
74
|
sqlcg/viz/assets/force-graph.min.js,sha256=jNdYdDdrYiUdUlElxRkolPBt30rstQk2q15Q32VVdzc,177272
|
|
74
75
|
sqlcg/viz/assets/template.html,sha256=9_j-mvo1ZxwgiJPDdVrNmca37dTrTjjYVd3977u-DxE,12294
|
|
75
|
-
sql_code_graph-1.35.1.dist-info/METADATA,sha256=
|
|
76
|
-
sql_code_graph-1.35.1.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
77
|
-
sql_code_graph-1.35.1.dist-info/entry_points.txt,sha256=Wfe49sVzV9p4eVFGo5RxcV-frr3HOP0yzzst8JBxQLQ,46
|
|
78
|
-
sql_code_graph-1.35.1.dist-info/RECORD,,
|
|
76
|
+
sql_code_graph-1.35.3.dist-info/METADATA,sha256=bR0GUYuujbDEYNj4602aE5Olejev4X6hp7KYlaezZjg,17791
|
|
77
|
+
sql_code_graph-1.35.3.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
78
|
+
sql_code_graph-1.35.3.dist-info/entry_points.txt,sha256=Wfe49sVzV9p4eVFGo5RxcV-frr3HOP0yzzst8JBxQLQ,46
|
|
79
|
+
sql_code_graph-1.35.3.dist-info/RECORD,,
|
sqlcg/__init__.py
CHANGED
sqlcg/indexer/indexer.py
CHANGED
|
@@ -388,7 +388,7 @@ def _flush_row_batch(
|
|
|
388
388
|
)
|
|
389
389
|
|
|
390
390
|
|
|
391
|
-
def _subprocess_parse_worker(parser_cls, dialect, path, sql, q):
|
|
391
|
+
def _subprocess_parse_worker(parser_cls, dialect, path, sql, q, rel_path=None):
|
|
392
392
|
"""Parse a single file in a subprocess; queue the ParsedFile (or exception).
|
|
393
393
|
|
|
394
394
|
parser_cls must be the *class* (pickleable), not an instance. The worker
|
|
@@ -398,10 +398,16 @@ def _subprocess_parse_worker(parser_cls, dialect, path, sql, q):
|
|
|
398
398
|
T-09-04: Parser constructors require a SchemaResolver. The subprocess gets a
|
|
399
399
|
fresh empty resolver; column resolution runs in infer-only mode, the same as
|
|
400
400
|
small-repo mode.
|
|
401
|
+
|
|
402
|
+
#171: rel_path is the repo-relative posix path used for CTE/temp namespace
|
|
403
|
+
keying. It MUST be forwarded so the incremental path produces the same keys
|
|
404
|
+
as index_repo (which threads rel_path through its task dict); without it the
|
|
405
|
+
namespace falls back to the absolute OS path, creating duplicate CTE/temp
|
|
406
|
+
nodes after an incremental reindex.
|
|
401
407
|
"""
|
|
402
408
|
try:
|
|
403
409
|
parser = parser_cls(SchemaResolver(dialect=str(dialect) if dialect else None))
|
|
404
|
-
out = parser.parse_file(path, sql)
|
|
410
|
+
out = parser.parse_file(path, sql, rel_path=rel_path)
|
|
405
411
|
q.put(out)
|
|
406
412
|
except BaseException as exc:
|
|
407
413
|
# Send the exception back; parent will re-raise.
|
|
@@ -1129,6 +1135,22 @@ class Indexer:
|
|
|
1129
1135
|
schema_resolver = SchemaResolver(dialect=dialect)
|
|
1130
1136
|
parser = get_parser(dialect, schema_resolver)
|
|
1131
1137
|
|
|
1138
|
+
# #170/#171: load schema_aliases and compute repo-relative posix paths the
|
|
1139
|
+
# SAME way index_repo does, so the incremental path applies the same alias
|
|
1140
|
+
# normalisation (#170) and CTE/temp namespace keys (#171). Without these,
|
|
1141
|
+
# a branch-switch resync produced phantom *_tmp.* nodes (aliases unapplied)
|
|
1142
|
+
# and duplicate CTE/temp nodes (absolute-path keys).
|
|
1143
|
+
from sqlcg.core.config import get_schema_aliases
|
|
1144
|
+
|
|
1145
|
+
schema_aliases = get_schema_aliases(root)
|
|
1146
|
+
root_resolved = Path(root).resolve()
|
|
1147
|
+
|
|
1148
|
+
def _rel_posix(fp: Path) -> str:
|
|
1149
|
+
try:
|
|
1150
|
+
return fp.resolve().relative_to(root_resolved).as_posix()
|
|
1151
|
+
except ValueError:
|
|
1152
|
+
return fp.as_posix()
|
|
1153
|
+
|
|
1132
1154
|
pass1_results: list[ParsedFile] = []
|
|
1133
1155
|
for file_path in reparse_set:
|
|
1134
1156
|
try:
|
|
@@ -1141,7 +1163,9 @@ class Indexer:
|
|
|
1141
1163
|
pass1_results.append(placeholder)
|
|
1142
1164
|
continue
|
|
1143
1165
|
try:
|
|
1144
|
-
parsed = self._index_single_file(parser, file_path, sql, timeout_per_file)
|
|
1166
|
+
parsed = self._index_single_file(
|
|
1167
|
+
parser, file_path, sql, timeout_per_file, rel_path=_rel_posix(file_path)
|
|
1168
|
+
)
|
|
1145
1169
|
except Exception as exc:
|
|
1146
1170
|
logger.warning("resync_changed: parse failed %s: %s", file_path, exc)
|
|
1147
1171
|
parsed = ParsedFile(path=file_path, dialect=dialect)
|
|
@@ -1287,7 +1311,11 @@ class Indexer:
|
|
|
1287
1311
|
def_path = Path(definer_fp)
|
|
1288
1312
|
def_sql = def_path.read_text(encoding="utf-8")
|
|
1289
1313
|
def_parsed = self._index_single_file(
|
|
1290
|
-
parser, def_path, def_sql, timeout_per_file
|
|
1314
|
+
parser,
|
|
1315
|
+
def_path,
|
|
1316
|
+
def_sql,
|
|
1317
|
+
timeout_per_file,
|
|
1318
|
+
rel_path=_rel_posix(def_path),
|
|
1291
1319
|
)
|
|
1292
1320
|
# Harvest only — register for cross_file_sources but do NOT upsert
|
|
1293
1321
|
aggregator.register_pass1(def_parsed)
|
|
@@ -1315,7 +1343,7 @@ class Indexer:
|
|
|
1315
1343
|
)
|
|
1316
1344
|
continue
|
|
1317
1345
|
try:
|
|
1318
|
-
cl_parsed = parser.parse_file(cl_path, cl_sql)
|
|
1346
|
+
cl_parsed = parser.parse_file(cl_path, cl_sql, rel_path=_rel_posix(cl_path))
|
|
1319
1347
|
except Exception as exc:
|
|
1320
1348
|
logger.warning("resync_changed: parse failed for closure file %s: %s", cl_path, exc)
|
|
1321
1349
|
cl_parsed = ParsedFile(path=cl_path, dialect=dialect)
|
|
@@ -1325,7 +1353,19 @@ class Indexer:
|
|
|
1325
1353
|
# ---- Step 7: Batched bulk upsert (same _flush_batch path as index_repo) ----
|
|
1326
1354
|
all_results = pass1_results + closure_results
|
|
1327
1355
|
|
|
1328
|
-
#
|
|
1356
|
+
# #170: key-normalisation choke point — apply schema_aliases + empty-identity
|
|
1357
|
+
# guard to EVERY parse result BEFORE the defined_table_registry is built and
|
|
1358
|
+
# before _upsert_file_batch. index_repo (line ~797) and reindex_file
|
|
1359
|
+
# (line ~1421) both do this; resync_changed previously did not, so a
|
|
1360
|
+
# branch-switch incremental reindex left staging-alias schemas (e.g. ba_tmp)
|
|
1361
|
+
# un-normalised, producing phantom *_tmp.* nodes that a from-scratch index
|
|
1362
|
+
# never creates. O(edges) per file, once per resync — outside the hot loop.
|
|
1363
|
+
from sqlcg.parsers.base import normalize_keys as _normalize_keys
|
|
1364
|
+
|
|
1365
|
+
for pf in all_results:
|
|
1366
|
+
_normalize_keys(pf, schema_aliases)
|
|
1367
|
+
|
|
1368
|
+
# Build a registry for duplicate DDL detection (post-normalisation full_ids)
|
|
1329
1369
|
defined_table_registry: dict[str, str] = {}
|
|
1330
1370
|
for pf in all_results:
|
|
1331
1371
|
for table in pf.defined_tables:
|
|
@@ -1432,7 +1472,14 @@ class Indexer:
|
|
|
1432
1472
|
# join-column edges until the next full index (plan-review BLOCKER).
|
|
1433
1473
|
self._resolve_join_columns(db)
|
|
1434
1474
|
|
|
1435
|
-
def _index_single_file(self, parser, path: Path, sql: str, timeout: int) -> ParsedFile:
|
|
1475
|
+
def _index_single_file(
|
|
1476
|
+
self,
|
|
1477
|
+
parser,
|
|
1478
|
+
path: Path,
|
|
1479
|
+
sql: str,
|
|
1480
|
+
timeout: int,
|
|
1481
|
+
rel_path: str | None = None,
|
|
1482
|
+
) -> ParsedFile:
|
|
1436
1483
|
"""Parse one file, with optional timeout via subprocess isolation.
|
|
1437
1484
|
|
|
1438
1485
|
T-09-04: Subprocess isolation via multiprocessing.Process + spawn context.
|
|
@@ -1446,12 +1493,17 @@ class Indexer:
|
|
|
1446
1493
|
path: Path to the file
|
|
1447
1494
|
sql: SQL text
|
|
1448
1495
|
timeout: Timeout in seconds (0 = no timeout)
|
|
1496
|
+
rel_path: Repo-relative posix path for CTE/temp namespace keying
|
|
1497
|
+
(#171). Threaded through to parse_file (both the in-process and
|
|
1498
|
+
subprocess branch) so the incremental path produces the same
|
|
1499
|
+
namespace keys as index_repo. Falls back to str(path) inside the
|
|
1500
|
+
parser when None.
|
|
1449
1501
|
|
|
1450
1502
|
Returns:
|
|
1451
1503
|
ParsedFile with parse_failed flag set if timeout occurs
|
|
1452
1504
|
"""
|
|
1453
1505
|
if timeout <= 0:
|
|
1454
|
-
return parser.parse_file(path, sql)
|
|
1506
|
+
return parser.parse_file(path, sql, rel_path=rel_path)
|
|
1455
1507
|
|
|
1456
1508
|
ctx = mp.get_context("spawn") # avoid fork-inherit pitfalls (KuzuDB connection FD etc.)
|
|
1457
1509
|
# Unbounded queue: the child writes one large ParsedFile (192–552 KB pickled).
|
|
@@ -1462,7 +1514,7 @@ class Indexer:
|
|
|
1462
1514
|
q: mp.Queue = ctx.Queue()
|
|
1463
1515
|
proc = ctx.Process(
|
|
1464
1516
|
target=_subprocess_parse_worker,
|
|
1465
|
-
args=(parser.__class__, parser.DIALECT, path, sql, q),
|
|
1517
|
+
args=(parser.__class__, parser.DIALECT, path, sql, q, rel_path),
|
|
1466
1518
|
daemon=True,
|
|
1467
1519
|
)
|
|
1468
1520
|
proc.start()
|
sqlcg/parsers/base.py
CHANGED
|
@@ -637,12 +637,14 @@ class SqlParser(ABC):
|
|
|
637
637
|
"""Return True if `_extract_column_lineage` would attempt extraction for `stmt`.
|
|
638
638
|
|
|
639
639
|
Mirrors the type/body checks `_extract_column_lineage` (below, this class)
|
|
640
|
-
already performs: only `exp.Select`, `exp.Insert` with a `SELECT` body,
|
|
641
|
-
`exp.Create` with a `Select`/`Subquery` body (CTAS / CREATE VIEW AS SELECT)
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
640
|
+
already performs: only `exp.Select`, `exp.Insert` with a `SELECT` body,
|
|
641
|
+
`exp.Create` with a `Select`/`Subquery` body (CTAS / CREATE VIEW AS SELECT),
|
|
642
|
+
and `exp.Merge` (column edges extracted structurally from its WHEN clauses —
|
|
643
|
+
see `_extract_merge_lineage`, plan/sprints/sprint_snowflake_lineage_patterns.md
|
|
644
|
+
PR-C) can ever produce column-lineage edges. Everything else (Update, Delete,
|
|
645
|
+
Use, Set, Comment, Drop, Alter, Command, TruncateTable, CREATE ...
|
|
646
|
+
LIKE/CLONE/column-defs, INSERT ... VALUES) is structurally lineage-free
|
|
647
|
+
regardless of `build_scope`'s outcome.
|
|
646
648
|
|
|
647
649
|
Used by `AnsiParser._parse_statement` to avoid marking `parse_failed=True`
|
|
648
650
|
for statement kinds that were never going to produce column lineage in the
|
|
@@ -658,8 +660,185 @@ class SqlParser(ABC):
|
|
|
658
660
|
return isinstance(stmt.expression, exp.Select)
|
|
659
661
|
if isinstance(stmt, exp.Create):
|
|
660
662
|
return isinstance(stmt.expression, (exp.Select, exp.Subquery))
|
|
663
|
+
if isinstance(stmt, exp.Merge):
|
|
664
|
+
return True
|
|
661
665
|
return False
|
|
662
666
|
|
|
667
|
+
def _extract_merge_lineage(
|
|
668
|
+
self,
|
|
669
|
+
stmt: Any,
|
|
670
|
+
dst_table: "TableRef | None",
|
|
671
|
+
query_sources: list["TableRef"] | None,
|
|
672
|
+
out: ParsedFile,
|
|
673
|
+
) -> list[LineageEdge]:
|
|
674
|
+
"""Extract column-level lineage from a MERGE statement's WHEN clauses.
|
|
675
|
+
|
|
676
|
+
MERGE column edges cannot come from sqlglot's ``lineage()`` (it does not model
|
|
677
|
+
MERGE branches), but the ``WHEN MATCHED ... UPDATE SET`` and
|
|
678
|
+
``WHEN NOT MATCHED ... INSERT (cols) VALUES (vals)`` clauses are structurally
|
|
679
|
+
extractable by a direct AST walk. This method performs that walk: it is bounded
|
|
680
|
+
and runs ONCE PER MERGE statement — it NEVER calls ``sg_lineage``, ``qualify``,
|
|
681
|
+
``build_scope`` or ``exp.expand`` (AC-C7,
|
|
682
|
+
plan/sprints/sprint_snowflake_lineage_patterns.md PR-C).
|
|
683
|
+
|
|
684
|
+
Resolution model:
|
|
685
|
+
- The MERGE target (``stmt.this``) and source (``stmt.args['using']``) are both
|
|
686
|
+
captured in ``query_sources`` as already-qualified TableRefs (the table-grain
|
|
687
|
+
path does NOT split them into target/sources for MERGE — ``dst_table`` is
|
|
688
|
+
``None`` here, so AC-C3's shipped table-grain behaviour is preserved). This
|
|
689
|
+
method matches ``stmt.this``/``using`` back to those refs by alias+name to
|
|
690
|
+
identify which is the target and which the source(s), then builds an
|
|
691
|
+
alias -> TableRef map over both.
|
|
692
|
+
- MATCHED UPDATE: each ``exp.EQ`` is ``target_col = source_expr``; the dst is a
|
|
693
|
+
column on the target, the src columns are every ``exp.Column`` in the RHS
|
|
694
|
+
(multi-source expressions emit one edge per source column — AC-C4). A
|
|
695
|
+
pure-literal RHS (no ``exp.Column``) contributes no edge (AC-C5) — mirrors the
|
|
696
|
+
pure-literal skip invariant.
|
|
697
|
+
- NOT MATCHED INSERT: ``then.this`` is the target column tuple, ``then.expression``
|
|
698
|
+
the values tuple; zip positionally and emit per source column. An INSERT with
|
|
699
|
+
no column list (``then.this is None``) is skip-and-logged
|
|
700
|
+
(``col_lineage_skip:merge_no_collist:``) — NO positional DDL fallback (gate
|
|
701
|
+
decision: positional guessing risks wrong edges).
|
|
702
|
+
|
|
703
|
+
Args:
|
|
704
|
+
stmt: the ``exp.Merge`` AST node.
|
|
705
|
+
dst_table: the resolved INSERT/CREATE target TableRef from the caller. For
|
|
706
|
+
MERGE this is ``None`` (the table-grain path leaves the target in
|
|
707
|
+
``query_sources``); the target is recovered here from ``stmt.this``.
|
|
708
|
+
query_sources: resolved table refs for the MERGE (target + ``USING``
|
|
709
|
+
source(s)), each carrying its ``.alias``, fully qualified.
|
|
710
|
+
out: ParsedFile, for skip/unresolved error records.
|
|
711
|
+
|
|
712
|
+
Returns:
|
|
713
|
+
List of LineageEdge (may be empty).
|
|
714
|
+
"""
|
|
715
|
+
import sqlglot.expressions as exp
|
|
716
|
+
|
|
717
|
+
edges: list[LineageEdge] = []
|
|
718
|
+
refs = list(query_sources or [])
|
|
719
|
+
if dst_table is not None:
|
|
720
|
+
refs.append(dst_table)
|
|
721
|
+
|
|
722
|
+
# Identify the MERGE target ref by matching stmt.this against the resolved refs.
|
|
723
|
+
# `_real_tables` does not preserve the AST alias on the TableRef, so match on the
|
|
724
|
+
# AST target's name (and alias, as a fallback) against each ref's name/alias.
|
|
725
|
+
# The remaining refs are the USING source(s).
|
|
726
|
+
target_ast = stmt.this if isinstance(stmt.this, exp.Table) else None
|
|
727
|
+
target_keys: set[str] = set()
|
|
728
|
+
if target_ast is not None:
|
|
729
|
+
if target_ast.name:
|
|
730
|
+
target_keys.add(target_ast.name.lower())
|
|
731
|
+
if target_ast.alias:
|
|
732
|
+
target_keys.add(target_ast.alias.lower())
|
|
733
|
+
|
|
734
|
+
def _ref_keys(ref: TableRef) -> set[str]:
|
|
735
|
+
keys: set[str] = set()
|
|
736
|
+
if ref.alias:
|
|
737
|
+
keys.add(ref.alias.lower())
|
|
738
|
+
if ref.name:
|
|
739
|
+
keys.add(ref.name.lower())
|
|
740
|
+
return keys
|
|
741
|
+
|
|
742
|
+
target_ref: TableRef | None = None
|
|
743
|
+
if target_keys:
|
|
744
|
+
for ref in refs:
|
|
745
|
+
if target_keys & _ref_keys(ref):
|
|
746
|
+
target_ref = ref
|
|
747
|
+
break
|
|
748
|
+
if target_ref is None:
|
|
749
|
+
label = ".".join(sorted(target_keys)) or "<unknown>"
|
|
750
|
+
out.errors.append(f"col_lineage_skip:merge_no_target:{label}")
|
|
751
|
+
return edges
|
|
752
|
+
|
|
753
|
+
source_refs = [r for r in refs if r is not target_ref]
|
|
754
|
+
|
|
755
|
+
# Build an alias/name -> TableRef map over the target + USING source(s).
|
|
756
|
+
# MERGE column references qualify by the table alias (e.g. `target.name`,
|
|
757
|
+
# `s.name`). `_real_tables` strips the alias from the resolved TableRef, so the
|
|
758
|
+
# alias is recovered from the AST nodes (target=stmt.this, source=using) and
|
|
759
|
+
# mapped to its resolved ref. Bare table names are registered as a fallback.
|
|
760
|
+
alias_map: dict[str, TableRef] = {}
|
|
761
|
+
|
|
762
|
+
def _register(ref: TableRef | None, ast_node: Any) -> None:
|
|
763
|
+
if ref is None:
|
|
764
|
+
return
|
|
765
|
+
if isinstance(ast_node, exp.Table) and ast_node.alias:
|
|
766
|
+
alias_map.setdefault(ast_node.alias.lower(), ref)
|
|
767
|
+
if ref.alias:
|
|
768
|
+
alias_map.setdefault(ref.alias.lower(), ref)
|
|
769
|
+
if ref.name:
|
|
770
|
+
alias_map.setdefault(ref.name.lower(), ref)
|
|
771
|
+
|
|
772
|
+
using_ast = stmt.args.get("using")
|
|
773
|
+
_register(target_ref, target_ast)
|
|
774
|
+
for src in source_refs:
|
|
775
|
+
_register(src, using_ast if len(source_refs) == 1 else None)
|
|
776
|
+
|
|
777
|
+
def _resolve_table(col: Any) -> TableRef | None:
|
|
778
|
+
"""Resolve the owning TableRef of a source column reference."""
|
|
779
|
+
tbl_name = col.table
|
|
780
|
+
if tbl_name:
|
|
781
|
+
return alias_map.get(tbl_name.lower())
|
|
782
|
+
# Unqualified RHS column — attribute to the lone source when unambiguous.
|
|
783
|
+
if len(source_refs) == 1:
|
|
784
|
+
return source_refs[0]
|
|
785
|
+
return None
|
|
786
|
+
|
|
787
|
+
def _emit(target_col_name: str, rhs: Any, transform: str) -> None:
|
|
788
|
+
"""Emit one edge per source ``exp.Column`` found in ``rhs``."""
|
|
789
|
+
if not target_col_name:
|
|
790
|
+
return
|
|
791
|
+
cols = list(rhs.find_all(exp.Column)) if rhs is not None else []
|
|
792
|
+
if not cols:
|
|
793
|
+
# Pure-literal / no source column — no edge (AC-C5).
|
|
794
|
+
return
|
|
795
|
+
dst_ref = ColumnRef(table=target_ref, name=target_col_name)
|
|
796
|
+
for col in cols:
|
|
797
|
+
src_table = _resolve_table(col)
|
|
798
|
+
if src_table is None:
|
|
799
|
+
out.errors.append(
|
|
800
|
+
f"col_lineage_skip:merge_unresolved:{col.table or ''}.{col.name}"
|
|
801
|
+
)
|
|
802
|
+
continue
|
|
803
|
+
edges.append(
|
|
804
|
+
LineageEdge(
|
|
805
|
+
src=ColumnRef(table=src_table, name=col.name),
|
|
806
|
+
dst=dst_ref,
|
|
807
|
+
transform=transform,
|
|
808
|
+
)
|
|
809
|
+
)
|
|
810
|
+
|
|
811
|
+
for when in stmt.find_all(exp.When):
|
|
812
|
+
then = when.args.get("then")
|
|
813
|
+
if isinstance(then, exp.Update):
|
|
814
|
+
# WHEN MATCHED THEN UPDATE SET target.a = s.a, target.b = s.x + s.y
|
|
815
|
+
for eq in then.expressions:
|
|
816
|
+
if not isinstance(eq, exp.EQ):
|
|
817
|
+
continue
|
|
818
|
+
lhs = eq.this
|
|
819
|
+
if not isinstance(lhs, exp.Column):
|
|
820
|
+
continue
|
|
821
|
+
_emit(lhs.name, eq.expression, "MERGE_UPDATE")
|
|
822
|
+
elif isinstance(then, exp.Insert):
|
|
823
|
+
# WHEN NOT MATCHED THEN INSERT (cols) VALUES (vals)
|
|
824
|
+
target_cols = then.this
|
|
825
|
+
values = then.expression
|
|
826
|
+
if target_cols is None:
|
|
827
|
+
# INSERT VALUES with no column list — skip-and-log, no positional
|
|
828
|
+
# DDL fallback (gate decision: positional guessing risks wrong edges).
|
|
829
|
+
out.errors.append(f"col_lineage_skip:merge_no_collist:{target_ref.full_id}")
|
|
830
|
+
continue
|
|
831
|
+
col_exprs = list(getattr(target_cols, "expressions", []) or [])
|
|
832
|
+
val_exprs = list(getattr(values, "expressions", []) or [])
|
|
833
|
+
# Positional alignment of (cols) with VALUES (...); a malformed MERGE
|
|
834
|
+
# with mismatched arities zips to the shorter list rather than raising.
|
|
835
|
+
for target_col, value in zip(col_exprs, val_exprs, strict=False):
|
|
836
|
+
if not isinstance(target_col, exp.Column):
|
|
837
|
+
continue
|
|
838
|
+
_emit(target_col.name, value, "MERGE_INSERT")
|
|
839
|
+
|
|
840
|
+
return edges
|
|
841
|
+
|
|
663
842
|
def _real_tables(self, scope: Any) -> list[TableRef]:
|
|
664
843
|
"""Return real (non-CTE) tables referenced in a scope.
|
|
665
844
|
|
|
@@ -1117,19 +1296,15 @@ class SqlParser(ABC):
|
|
|
1117
1296
|
join_col_resolves: list[JoinColResolve] = []
|
|
1118
1297
|
_qualify_failed: bool = False
|
|
1119
1298
|
|
|
1120
|
-
#
|
|
1121
|
-
# sqlglot's lineage()
|
|
1122
|
-
#
|
|
1123
|
-
#
|
|
1299
|
+
# MERGE column lineage (un-defer T-07-06,
|
|
1300
|
+
# plan/sprints/sprint_snowflake_lineage_patterns.md PR-C). sqlglot's lineage()
|
|
1301
|
+
# API does not handle MERGE branches, but the WHEN MATCHED UPDATE SET / WHEN NOT
|
|
1302
|
+
# MATCHED INSERT VALUES clauses are structurally extractable by a direct AST walk.
|
|
1303
|
+
# This is a bounded once-per-statement path that NEVER calls sg_lineage / qualify /
|
|
1304
|
+
# build_scope / exp.expand (AC-C7).
|
|
1124
1305
|
if isinstance(stmt, exp.Merge):
|
|
1125
|
-
|
|
1126
|
-
|
|
1127
|
-
try:
|
|
1128
|
-
dst_name = stmt.this.name
|
|
1129
|
-
except Exception:
|
|
1130
|
-
dst_name = None
|
|
1131
|
-
out.errors.append(f"col_lineage_skip:merge_branch:{dst_name or '<unknown>'}")
|
|
1132
|
-
return LineageExtraction(edges=edges, star_sources=star_sources)
|
|
1306
|
+
merge_edges = self._extract_merge_lineage(stmt, dst_table, query_sources, out)
|
|
1307
|
+
return LineageExtraction(edges=merge_edges, star_sources=star_sources)
|
|
1133
1308
|
|
|
1134
1309
|
# Only extract column lineage for certain statement types
|
|
1135
1310
|
if not isinstance(stmt, (exp.Select, exp.Insert, exp.Create)):
|
sqlcg/parsers/bigquery_parser.py
CHANGED
|
@@ -35,12 +35,16 @@ class BigQueryParser(AnsiParser):
|
|
|
35
35
|
"""
|
|
36
36
|
super().__init__(schema_resolver, schema_aliases=schema_aliases)
|
|
37
37
|
|
|
38
|
-
def parse_file(self, path: Path, sql: str) -> ParsedFile:
|
|
38
|
+
def parse_file(self, path: Path, sql: str, rel_path: str | None = None) -> ParsedFile:
|
|
39
39
|
"""Parse BigQuery SQL file with scripting block detection.
|
|
40
40
|
|
|
41
41
|
Args:
|
|
42
42
|
path: Path to the source file
|
|
43
43
|
sql: SQL text to parse
|
|
44
|
+
rel_path: Repo-relative posix path for CTE/temp namespace keying.
|
|
45
|
+
Accepted for signature parity with the other parsers (the pool and
|
|
46
|
+
resync paths always pass it); scripting-fallback BigQuery files do
|
|
47
|
+
not register CTE/temp nodes, so it is currently unused here.
|
|
44
48
|
|
|
45
49
|
Returns:
|
|
46
50
|
ParsedFile with parsed statements and metadata
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
"""Dialect-agnostic bounded constant-fold of dynamic table-name expressions.
|
|
2
|
+
|
|
3
|
+
This is the pure core of the "generic variable-name resolution" feature
|
|
4
|
+
(``plan/sprints/feature_generic_var_name_resolution.md``). It partially folds a
|
|
5
|
+
string-valued expression AST (string literals, ``||`` / ``CONCAT``, and 1-hop
|
|
6
|
+
``$var`` references) into a concrete ``[catalog.]db.name`` table reference,
|
|
7
|
+
recovering the statically-determined trailing identifier components while
|
|
8
|
+
honestly giving up when the name cannot be determined.
|
|
9
|
+
|
|
10
|
+
No parser/indexer state is touched here: ``resolve_dynamic_name`` is a pure
|
|
11
|
+
function, fully unit-testable in isolation. PR-2 wires it into the Snowflake
|
|
12
|
+
parser via a sink predicate; this module ships unwired.
|
|
13
|
+
|
|
14
|
+
Fold classification:
|
|
15
|
+
|
|
16
|
+
* **LIT (resolvable):** ``exp.Literal`` (string), ``exp.DPipe`` / ``exp.Concat``
|
|
17
|
+
over folds, and a 1-hop ``$var`` whose RHS (looked up in ``var_env`` exactly
|
|
18
|
+
once) itself folds to a leading constant prefix.
|
|
19
|
+
* **OPAQUE (unresolvable):** runtime functions (``current_database()``,
|
|
20
|
+
``split_part(...)``, ``current_warehouse()``, any ``exp.Func`` /
|
|
21
|
+
``exp.Anonymous`` that is not a pure string op), scalar subqueries that are not
|
|
22
|
+
a single-projection of folds, bind parameters (``exp.Placeholder``), and any
|
|
23
|
+
``$var`` reference at chain depth >= 2.
|
|
24
|
+
|
|
25
|
+
Name extraction interprets the fold as ``[catalog.]db.name`` (name-last,
|
|
26
|
+
dot-split): it keeps the rightmost statically-determined ``db.name`` tail,
|
|
27
|
+
wildcards (drops) a leading OPAQUE catalog, keeps a static *literal* catalog, and
|
|
28
|
+
returns ``None`` when the whole thing is opaque / the tail is a bare name / the
|
|
29
|
+
tail lacks a resolvable ``db.name``.
|
|
30
|
+
"""
|
|
31
|
+
|
|
32
|
+
from __future__ import annotations
|
|
33
|
+
|
|
34
|
+
from dataclasses import dataclass
|
|
35
|
+
|
|
36
|
+
import sqlglot.expressions as exp
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@dataclass(frozen=True)
|
|
40
|
+
class _Lit:
|
|
41
|
+
"""A statically-known string segment of the folded expression."""
|
|
42
|
+
|
|
43
|
+
text: str
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@dataclass(frozen=True)
|
|
47
|
+
class _Opaque:
|
|
48
|
+
"""A runtime-determined segment that cannot be folded statically."""
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
_Part = _Lit | _Opaque
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _fold_parts(
|
|
55
|
+
node: exp.Expression, # type: ignore[attr-defined]
|
|
56
|
+
var_env: dict[str, exp.Expression], # type: ignore[attr-defined]
|
|
57
|
+
*,
|
|
58
|
+
chain_depth: int,
|
|
59
|
+
) -> list[_Part]:
|
|
60
|
+
"""Fold ``node`` into an ordered list of LIT / OPAQUE parts.
|
|
61
|
+
|
|
62
|
+
Descends string-concatenation structure (``||`` / ``CONCAT``), single-
|
|
63
|
+
projection subqueries and parentheses; resolves 1-hop ``$var`` references
|
|
64
|
+
against ``var_env``; classifies everything else OPAQUE.
|
|
65
|
+
"""
|
|
66
|
+
# Unwrap a single-projection (SELECT ...) scalar subquery / paren.
|
|
67
|
+
if isinstance(node, exp.Subquery):
|
|
68
|
+
return _fold_parts(node.this, var_env, chain_depth=chain_depth)
|
|
69
|
+
if isinstance(node, exp.Paren):
|
|
70
|
+
return _fold_parts(node.this, var_env, chain_depth=chain_depth)
|
|
71
|
+
if isinstance(node, exp.Select):
|
|
72
|
+
projections = node.expressions
|
|
73
|
+
if len(projections) != 1:
|
|
74
|
+
return [_Opaque()]
|
|
75
|
+
return _fold_parts(projections[0], var_env, chain_depth=chain_depth)
|
|
76
|
+
|
|
77
|
+
# String concatenation: DPipe is binary (left-nested), Concat is n-ary.
|
|
78
|
+
if isinstance(node, exp.DPipe):
|
|
79
|
+
return _fold_parts(node.this, var_env, chain_depth=chain_depth) + _fold_parts(
|
|
80
|
+
node.expression, var_env, chain_depth=chain_depth
|
|
81
|
+
)
|
|
82
|
+
if isinstance(node, exp.Concat):
|
|
83
|
+
parts: list[_Part] = []
|
|
84
|
+
for child in node.expressions:
|
|
85
|
+
parts.extend(_fold_parts(child, var_env, chain_depth=chain_depth))
|
|
86
|
+
return parts
|
|
87
|
+
|
|
88
|
+
# Concrete string literal.
|
|
89
|
+
if isinstance(node, exp.Literal) and node.is_string:
|
|
90
|
+
return [_Lit(node.this)]
|
|
91
|
+
|
|
92
|
+
# 1-hop $var reference.
|
|
93
|
+
if isinstance(node, exp.Parameter):
|
|
94
|
+
var = node.this
|
|
95
|
+
name = var.name if isinstance(var, exp.Var) else None
|
|
96
|
+
if name is None or chain_depth < 1:
|
|
97
|
+
return [_Opaque()]
|
|
98
|
+
rhs = var_env.get(name.lower())
|
|
99
|
+
if rhs is None:
|
|
100
|
+
return [_Opaque()]
|
|
101
|
+
# Resolve exactly one hop: the looked-up RHS may not itself follow
|
|
102
|
+
# further $vars (depth-2 is OPAQUE because chain_depth drops to 0).
|
|
103
|
+
return _fold_parts(rhs, var_env, chain_depth=chain_depth - 1)
|
|
104
|
+
|
|
105
|
+
# Everything else (runtime funcs, bind params, non-fold subqueries) is opaque.
|
|
106
|
+
return [_Opaque()]
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def resolve_dynamic_name(
|
|
110
|
+
rhs_expr: exp.Expression, # type: ignore[attr-defined]
|
|
111
|
+
var_env: dict[str, exp.Expression], # type: ignore[attr-defined]
|
|
112
|
+
*,
|
|
113
|
+
chain_depth: int = 1,
|
|
114
|
+
) -> exp.Table | None:
|
|
115
|
+
"""Bounded partial constant-fold of a string-expression AST into a TableRef.
|
|
116
|
+
|
|
117
|
+
Args:
|
|
118
|
+
rhs_expr: the RHS expression assigned to the dynamic name (the AST that
|
|
119
|
+
the ``IDENTIFIER($var)`` sink dereferences).
|
|
120
|
+
var_env: lowercased var name -> its RHS AST, for 1-hop chain resolution.
|
|
121
|
+
chain_depth: max var-lookup hops (default 1; corpus max is 1). A var
|
|
122
|
+
whose value references another var resolves only the first hop.
|
|
123
|
+
|
|
124
|
+
Returns:
|
|
125
|
+
An ``exp.Table`` for the resolvable trailing identifier components, or
|
|
126
|
+
``None`` to give up honestly (caller leaves the sink dropped).
|
|
127
|
+
"""
|
|
128
|
+
parts = _fold_parts(rhs_expr, var_env, chain_depth=chain_depth)
|
|
129
|
+
|
|
130
|
+
if not parts:
|
|
131
|
+
return None
|
|
132
|
+
|
|
133
|
+
# Take the rightmost contiguous run of LIT parts (the static suffix), noting
|
|
134
|
+
# whether an OPAQUE segment sits immediately before that run (catalog
|
|
135
|
+
# position is then runtime-determined).
|
|
136
|
+
suffix_lits: list[str] = []
|
|
137
|
+
opaque_precedes_suffix = False
|
|
138
|
+
for part in reversed(parts):
|
|
139
|
+
if isinstance(part, _Lit):
|
|
140
|
+
suffix_lits.append(part.text)
|
|
141
|
+
else:
|
|
142
|
+
opaque_precedes_suffix = True
|
|
143
|
+
break
|
|
144
|
+
suffix_lits.reverse()
|
|
145
|
+
|
|
146
|
+
if not suffix_lits:
|
|
147
|
+
# All-opaque (no static tail at all).
|
|
148
|
+
return None
|
|
149
|
+
|
|
150
|
+
suffix = "".join(suffix_lits)
|
|
151
|
+
table = exp.to_table(suffix, dialect="snowflake")
|
|
152
|
+
|
|
153
|
+
# A resolvable object id needs at least db.name (schema.table). A bare name
|
|
154
|
+
# (no db) is a give-up: we never guess the schema/table. A missing name (tail
|
|
155
|
+
# ends on a dot) is likewise unresolvable.
|
|
156
|
+
if table.db == "" or table.name == "":
|
|
157
|
+
return None
|
|
158
|
+
|
|
159
|
+
# Catalog policy:
|
|
160
|
+
# - literal catalog present in the static suffix (3-part name) -> KEEP it.
|
|
161
|
+
# - an OPAQUE segment precedes the suffix (the catalog slot came from a
|
|
162
|
+
# runtime segment) -> wildcard / drop the catalog (emit catalog-less
|
|
163
|
+
# db.name) so the node merges with DDL / plain-SQL refs to the same table.
|
|
164
|
+
# to_table already yields catalog='' when the static suffix had no catalog
|
|
165
|
+
# component (e.g. '.EMP.APPLICATION_FIELDS' or 'DHB.KOSTEN'); the explicit
|
|
166
|
+
# drop only matters when an opaque prefix sits before a literal 3-part tail.
|
|
167
|
+
if opaque_precedes_suffix and table.catalog != "":
|
|
168
|
+
table.set("catalog", None)
|
|
169
|
+
|
|
170
|
+
return table
|
|
File without changes
|
|
File without changes
|