sql-code-graph 1.35.1__py3-none-any.whl → 1.35.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sql-code-graph
3
- Version: 1.35.1
3
+ Version: 1.35.3
4
4
  Summary: SQL code graph analyzer and lineage tracer
5
5
  Project-URL: Homepage, https://github.com/Warhorze/sql-code-graph
6
6
  Project-URL: Repository, https://github.com/Warhorze/sql-code-graph
@@ -1,4 +1,4 @@
1
- sqlcg/__init__.py,sha256=VahDFz44BfVysbx4bUBilcYdqMPT5hNDCMqoIkgonw0,116
1
+ sqlcg/__init__.py,sha256=iuiB1QBl9EVtoW0aNt89z6gr_yDaZ5RV7phaFO8zX0Y,116
2
2
  sqlcg/__main__.py,sha256=1YoFLcqEgTwYq1J3TbUwpkdG0zeeLIf2fJvwWI-CLFU,109
3
3
  sqlcg/cli/__init__.py,sha256=W8fD0LpMq2xm_5WKGNMvJh2WBL1ho5E8hUeAqXQYT1g,28
4
4
  sqlcg/cli/coverage.py,sha256=Xm9ITzZDHv2mJ70Q5jCacVuhDStVrE3gq12_-Ypvtd8,43823
@@ -34,7 +34,7 @@ sqlcg/indexer/__init__.py,sha256=Wh20Unz2OHs1oIyWLrpurPAasF0BET2g4iXtNk7mh2U,56
34
34
  sqlcg/indexer/dbt_adapter.py,sha256=EB5x1WU5Z9d-I97ADDj88S_hG1C4z4nbrv8JUCzXfy8,686
35
35
  sqlcg/indexer/error_classify.py,sha256=-sp8cRmuOBHu_CxnCtaXf34YxHFYwIFNjIrn4LaEv6M,7142
36
36
  sqlcg/indexer/git_delta.py,sha256=zYdH5q-jV7w_ne8Oxdywsy0N3rwUjpd5RjEDurlrMSA,5026
37
- sqlcg/indexer/indexer.py,sha256=u6fIhgLfBs9fKJoAyO4hJrm9152YMe08EHycncctQx8,102697
37
+ sqlcg/indexer/indexer.py,sha256=LNc5pI3_WwIMT2iTnCu9VB1Kk21T1qAmN7fupyiQu2s,105315
38
38
  sqlcg/indexer/pool.py,sha256=iMmCQtpDRKBTQBep2_EUq9THcsE18Zgk0hdaFB_CwiA,19006
39
39
  sqlcg/indexer/walker.py,sha256=Cft6JiJtdBFy0HR6L9pJdr5Fg0eRR3XBW1OMtM2apto,1947
40
40
  sqlcg/indexer/watcher.py,sha256=mJQq1LASRLKKwhz0WhCUWPLLqyPR2_-FD_8efYU6gE8,8442
@@ -45,8 +45,9 @@ sqlcg/metrics/__init__.py,sha256=hLJ6wm4St8qqYwKh3o9QG7lcEt1BEYM31ccqO9tGpIg,133
45
45
  sqlcg/metrics/store.py,sha256=KuDtxvyAgug9_KtiSCpvgKM2VZM7VSaI3D11uMLjJJk,10604
46
46
  sqlcg/parsers/__init__.py,sha256=AamA8wBbDZV9_zEtZCI4Hyen5UAVKHmBwjTghTt2PZE,785
47
47
  sqlcg/parsers/ansi_parser.py,sha256=RX6eVj7gt1qmsHNJLAF_a4jyW3RCI5W2oF4rd53cKNg,39336
48
- sqlcg/parsers/base.py,sha256=Q2X_qN7DmGFU2oA6Z9fcxyjpkeb6lOsia4BcwPvwNLo,97806
49
- sqlcg/parsers/bigquery_parser.py,sha256=mOnWTfXB_Dp4JwFE1PVYOB6CDPf5nYE0Dea8kJCl9uQ,2827
48
+ sqlcg/parsers/base.py,sha256=d5s5_LSv96jrww9vx52GujjrLHwpxy_UOhmIlWcKglw,106489
49
+ sqlcg/parsers/bigquery_parser.py,sha256=g0B6aIpMyxLMVQ3ohAAjzR4nEmMh-WGkFcYLMiKdLxs,3177
50
+ sqlcg/parsers/dynamic_name.py,sha256=q0QAa9iAcmRW4e_0G2b2j-xTbI3VR1-Wwa-nJRLtrQw,6836
50
51
  sqlcg/parsers/postgres_parser.py,sha256=lYfUpQY6j4Qm7ndXBtXbgPoGzYqYddWt5YeFnWKdA6I,946
51
52
  sqlcg/parsers/registry.py,sha256=LXy1F6rqQI6VdxpRvZg_tNpoEucW3mXZHYBMlMONbX4,1496
52
53
  sqlcg/parsers/snowflake_parser.py,sha256=cv7bzBm6Wmwa8uY41Y59ebfFjnP1Gk0Sjp2KN_QBGD8,47542
@@ -72,7 +73,7 @@ sqlcg/viz/render.py,sha256=BINkGbJbbb_iqhrkN795RaQsdg8nqCiJtsEFF1yo22Y,2737
72
73
  sqlcg/viz/tags.py,sha256=6zRnGlHjuGmEeB6yN1uhzm8rqL7ZGoyL1Ki7jI5oM6A,5368
73
74
  sqlcg/viz/assets/force-graph.min.js,sha256=jNdYdDdrYiUdUlElxRkolPBt30rstQk2q15Q32VVdzc,177272
74
75
  sqlcg/viz/assets/template.html,sha256=9_j-mvo1ZxwgiJPDdVrNmca37dTrTjjYVd3977u-DxE,12294
75
- sql_code_graph-1.35.1.dist-info/METADATA,sha256=YOc2OzDrdadgZdpmY3QhwueUwScKj39svzKO4dgd9Q0,17791
76
- sql_code_graph-1.35.1.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
77
- sql_code_graph-1.35.1.dist-info/entry_points.txt,sha256=Wfe49sVzV9p4eVFGo5RxcV-frr3HOP0yzzst8JBxQLQ,46
78
- sql_code_graph-1.35.1.dist-info/RECORD,,
76
+ sql_code_graph-1.35.3.dist-info/METADATA,sha256=bR0GUYuujbDEYNj4602aE5Olejev4X6hp7KYlaezZjg,17791
77
+ sql_code_graph-1.35.3.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
78
+ sql_code_graph-1.35.3.dist-info/entry_points.txt,sha256=Wfe49sVzV9p4eVFGo5RxcV-frr3HOP0yzzst8JBxQLQ,46
79
+ sql_code_graph-1.35.3.dist-info/RECORD,,
sqlcg/__init__.py CHANGED
@@ -1,5 +1,5 @@
1
1
  """SQL Code Graph - SQL lineage and dependency analysis tool."""
2
2
 
3
- __version__ = "1.35.1"
3
+ __version__ = "1.35.3"
4
4
 
5
5
  __all__ = ["__version__"]
sqlcg/indexer/indexer.py CHANGED
@@ -388,7 +388,7 @@ def _flush_row_batch(
388
388
  )
389
389
 
390
390
 
391
- def _subprocess_parse_worker(parser_cls, dialect, path, sql, q):
391
+ def _subprocess_parse_worker(parser_cls, dialect, path, sql, q, rel_path=None):
392
392
  """Parse a single file in a subprocess; queue the ParsedFile (or exception).
393
393
 
394
394
  parser_cls must be the *class* (pickleable), not an instance. The worker
@@ -398,10 +398,16 @@ def _subprocess_parse_worker(parser_cls, dialect, path, sql, q):
398
398
  T-09-04: Parser constructors require a SchemaResolver. The subprocess gets a
399
399
  fresh empty resolver; column resolution runs in infer-only mode, the same as
400
400
  small-repo mode.
401
+
402
+ #171: rel_path is the repo-relative posix path used for CTE/temp namespace
403
+ keying. It MUST be forwarded so the incremental path produces the same keys
404
+ as index_repo (which threads rel_path through its task dict); without it the
405
+ namespace falls back to the absolute OS path, creating duplicate CTE/temp
406
+ nodes after an incremental reindex.
401
407
  """
402
408
  try:
403
409
  parser = parser_cls(SchemaResolver(dialect=str(dialect) if dialect else None))
404
- out = parser.parse_file(path, sql)
410
+ out = parser.parse_file(path, sql, rel_path=rel_path)
405
411
  q.put(out)
406
412
  except BaseException as exc:
407
413
  # Send the exception back; parent will re-raise.
@@ -1129,6 +1135,22 @@ class Indexer:
1129
1135
  schema_resolver = SchemaResolver(dialect=dialect)
1130
1136
  parser = get_parser(dialect, schema_resolver)
1131
1137
 
1138
+ # #170/#171: load schema_aliases and compute repo-relative posix paths the
1139
+ # SAME way index_repo does, so the incremental path applies the same alias
1140
+ # normalisation (#170) and CTE/temp namespace keys (#171). Without these,
1141
+ # a branch-switch resync produced phantom *_tmp.* nodes (aliases unapplied)
1142
+ # and duplicate CTE/temp nodes (absolute-path keys).
1143
+ from sqlcg.core.config import get_schema_aliases
1144
+
1145
+ schema_aliases = get_schema_aliases(root)
1146
+ root_resolved = Path(root).resolve()
1147
+
1148
def _rel_posix(fp: Path) -> str:
    """Return ``fp`` as a posix path relative to the resolved repo root.

    When the resolved ``fp`` does not live under ``root_resolved``,
    ``relative_to`` raises ``ValueError`` and the unresolved ``fp`` is
    returned in posix form instead.
    """
    try:
        inside_repo = fp.resolve().relative_to(root_resolved)
    except ValueError:
        # Not under the repo root: fall back to the path exactly as given.
        return fp.as_posix()
    else:
        return inside_repo.as_posix()
1153
+
1132
1154
  pass1_results: list[ParsedFile] = []
1133
1155
  for file_path in reparse_set:
1134
1156
  try:
@@ -1141,7 +1163,9 @@ class Indexer:
1141
1163
  pass1_results.append(placeholder)
1142
1164
  continue
1143
1165
  try:
1144
- parsed = self._index_single_file(parser, file_path, sql, timeout_per_file)
1166
+ parsed = self._index_single_file(
1167
+ parser, file_path, sql, timeout_per_file, rel_path=_rel_posix(file_path)
1168
+ )
1145
1169
  except Exception as exc:
1146
1170
  logger.warning("resync_changed: parse failed %s: %s", file_path, exc)
1147
1171
  parsed = ParsedFile(path=file_path, dialect=dialect)
@@ -1287,7 +1311,11 @@ class Indexer:
1287
1311
  def_path = Path(definer_fp)
1288
1312
  def_sql = def_path.read_text(encoding="utf-8")
1289
1313
  def_parsed = self._index_single_file(
1290
- parser, def_path, def_sql, timeout_per_file
1314
+ parser,
1315
+ def_path,
1316
+ def_sql,
1317
+ timeout_per_file,
1318
+ rel_path=_rel_posix(def_path),
1291
1319
  )
1292
1320
  # Harvest only — register for cross_file_sources but do NOT upsert
1293
1321
  aggregator.register_pass1(def_parsed)
@@ -1315,7 +1343,7 @@ class Indexer:
1315
1343
  )
1316
1344
  continue
1317
1345
  try:
1318
- cl_parsed = parser.parse_file(cl_path, cl_sql)
1346
+ cl_parsed = parser.parse_file(cl_path, cl_sql, rel_path=_rel_posix(cl_path))
1319
1347
  except Exception as exc:
1320
1348
  logger.warning("resync_changed: parse failed for closure file %s: %s", cl_path, exc)
1321
1349
  cl_parsed = ParsedFile(path=cl_path, dialect=dialect)
@@ -1325,7 +1353,19 @@ class Indexer:
1325
1353
  # ---- Step 7: Batched bulk upsert (same _flush_batch path as index_repo) ----
1326
1354
  all_results = pass1_results + closure_results
1327
1355
 
1328
- # Build a registry for duplicate DDL detection
1356
+ # #170: key-normalisation choke point — apply schema_aliases + empty-identity
1357
+ # guard to EVERY parse result BEFORE the defined_table_registry is built and
1358
+ # before _upsert_file_batch. index_repo (line ~797) and reindex_file
1359
+ # (line ~1421) both do this; resync_changed previously did not, so a
1360
+ # branch-switch incremental reindex left staging-alias schemas (e.g. ba_tmp)
1361
+ # un-normalised, producing phantom *_tmp.* nodes that a from-scratch index
1362
+ # never creates. O(edges) per file, once per resync — outside the hot loop.
1363
+ from sqlcg.parsers.base import normalize_keys as _normalize_keys
1364
+
1365
+ for pf in all_results:
1366
+ _normalize_keys(pf, schema_aliases)
1367
+
1368
+ # Build a registry for duplicate DDL detection (post-normalisation full_ids)
1329
1369
  defined_table_registry: dict[str, str] = {}
1330
1370
  for pf in all_results:
1331
1371
  for table in pf.defined_tables:
@@ -1432,7 +1472,14 @@ class Indexer:
1432
1472
  # join-column edges until the next full index (plan-review BLOCKER).
1433
1473
  self._resolve_join_columns(db)
1434
1474
 
1435
- def _index_single_file(self, parser, path: Path, sql: str, timeout: int) -> ParsedFile:
1475
+ def _index_single_file(
1476
+ self,
1477
+ parser,
1478
+ path: Path,
1479
+ sql: str,
1480
+ timeout: int,
1481
+ rel_path: str | None = None,
1482
+ ) -> ParsedFile:
1436
1483
  """Parse one file, with optional timeout via subprocess isolation.
1437
1484
 
1438
1485
  T-09-04: Subprocess isolation via multiprocessing.Process + spawn context.
@@ -1446,12 +1493,17 @@ class Indexer:
1446
1493
  path: Path to the file
1447
1494
  sql: SQL text
1448
1495
  timeout: Timeout in seconds (0 = no timeout)
1496
+ rel_path: Repo-relative posix path for CTE/temp namespace keying
1497
+ (#171). Threaded through to parse_file (both the in-process and
1498
+ subprocess branch) so the incremental path produces the same
1499
+ namespace keys as index_repo. Falls back to str(path) inside the
1500
+ parser when None.
1449
1501
 
1450
1502
  Returns:
1451
1503
  ParsedFile with parse_failed flag set if timeout occurs
1452
1504
  """
1453
1505
  if timeout <= 0:
1454
- return parser.parse_file(path, sql)
1506
+ return parser.parse_file(path, sql, rel_path=rel_path)
1455
1507
 
1456
1508
  ctx = mp.get_context("spawn") # avoid fork-inherit pitfalls (KuzuDB connection FD etc.)
1457
1509
  # Unbounded queue: the child writes one large ParsedFile (192–552 KB pickled).
@@ -1462,7 +1514,7 @@ class Indexer:
1462
1514
  q: mp.Queue = ctx.Queue()
1463
1515
  proc = ctx.Process(
1464
1516
  target=_subprocess_parse_worker,
1465
- args=(parser.__class__, parser.DIALECT, path, sql, q),
1517
+ args=(parser.__class__, parser.DIALECT, path, sql, q, rel_path),
1466
1518
  daemon=True,
1467
1519
  )
1468
1520
  proc.start()
sqlcg/parsers/base.py CHANGED
@@ -637,12 +637,14 @@ class SqlParser(ABC):
637
637
  """Return True if `_extract_column_lineage` would attempt extraction for `stmt`.
638
638
 
639
639
  Mirrors the type/body checks `_extract_column_lineage` (below, this class)
640
- already performs: only `exp.Select`, `exp.Insert` with a `SELECT` body, and
641
- `exp.Create` with a `Select`/`Subquery` body (CTAS / CREATE VIEW AS SELECT)
642
- can ever produce column-lineage edges. `exp.Merge` is explicitly skipped there
643
- (T-07-06, deferred). Everything else (Update, Delete, Use, Set, Comment, Drop,
644
- Alter, Command, TruncateTable, CREATE ... LIKE/CLONE/column-defs, INSERT ...
645
- VALUES) is structurally lineage-free regardless of `build_scope`'s outcome.
640
+ already performs: only `exp.Select`, `exp.Insert` with a `SELECT` body,
641
+ `exp.Create` with a `Select`/`Subquery` body (CTAS / CREATE VIEW AS SELECT),
642
+ and `exp.Merge` (column edges extracted structurally from its WHEN clauses —
643
+ see `_extract_merge_lineage`, plan/sprints/sprint_snowflake_lineage_patterns.md
644
+ PR-C) can ever produce column-lineage edges. Everything else (Update, Delete,
645
+ Use, Set, Comment, Drop, Alter, Command, TruncateTable, CREATE ...
646
+ LIKE/CLONE/column-defs, INSERT ... VALUES) is structurally lineage-free
647
+ regardless of `build_scope`'s outcome.
646
648
 
647
649
  Used by `AnsiParser._parse_statement` to avoid marking `parse_failed=True`
648
650
  for statement kinds that were never going to produce column lineage in the
@@ -658,8 +660,185 @@ class SqlParser(ABC):
658
660
  return isinstance(stmt.expression, exp.Select)
659
661
  if isinstance(stmt, exp.Create):
660
662
  return isinstance(stmt.expression, (exp.Select, exp.Subquery))
663
+ if isinstance(stmt, exp.Merge):
664
+ return True
661
665
  return False
662
666
 
667
def _extract_merge_lineage(
    self,
    stmt: Any,
    dst_table: "TableRef | None",
    query_sources: list["TableRef"] | None,
    out: ParsedFile,
) -> list[LineageEdge]:
    """Build column-level lineage edges from the WHEN clauses of a MERGE.

    The statement is walked directly: every ``exp.When`` under ``stmt`` is
    visited once and its ``then`` action inspected.

    * ``UPDATE SET col = expr`` — for each ``exp.EQ`` whose left side is an
      ``exp.Column``, one ``MERGE_UPDATE`` edge is emitted per ``exp.Column``
      found in the right side.
    * ``INSERT (cols) VALUES (vals)`` — the column list and the value list
      are paired positionally (the shorter list wins on arity mismatch) and
      one ``MERGE_INSERT`` edge is emitted per ``exp.Column`` found in each
      value. An INSERT without a column list is not guessed at; it is
      recorded as ``col_lineage_skip:merge_no_collist:<target full_id>``.

    A right-hand side containing no ``exp.Column`` (e.g. a pure literal)
    yields no edge.

    Target / source identification:
        ``query_sources`` (plus ``dst_table`` when given) are the candidate
        refs. The target is the first candidate whose lower-cased alias or
        name matches the lower-cased name or alias of ``stmt.this``; all
        other candidates are treated as USING sources. When no candidate
        matches, ``col_lineage_skip:merge_no_target:<label>`` is recorded and
        an empty list is returned.

    Column qualifier resolution:
        A qualified column (``s.name``) is looked up in a map keyed by the
        lower-cased AST alias, ref alias and ref name (first registration
        wins; the target is registered first). An unqualified column is
        attributed to the source only when there is exactly one source.
        Unresolvable columns are recorded as
        ``col_lineage_skip:merge_unresolved:<table>.<column>`` and skipped.

    Args:
        stmt: The ``exp.Merge`` AST node.
        dst_table: Optional extra candidate ref supplied by the caller;
            appended after ``query_sources`` when not ``None``.
        query_sources: Resolved table refs for the statement (may be
            ``None`` or empty).
        out: ParsedFile whose ``errors`` list receives skip records.

    Returns:
        The list of emitted ``LineageEdge`` objects (possibly empty).
    """
    import sqlglot.expressions as exp

    def lowered(*labels: Any) -> set[str]:
        """Collect the lower-cased form of every truthy label."""
        return {label.lower() for label in labels if label}

    collected: list[LineageEdge] = []

    # Candidate refs: everything the caller resolved, plus dst_table if any.
    candidates = [*(query_sources or [])]
    if dst_table is not None:
        candidates.append(dst_table)

    # The MERGE target as it appears in the AST (only plain tables qualify).
    merge_target_node = stmt.this if isinstance(stmt.this, exp.Table) else None
    wanted: set[str] = set()
    if merge_target_node is not None:
        wanted = lowered(merge_target_node.name, merge_target_node.alias)

    # First candidate sharing a name/alias with the AST target is the target.
    target_ref = None
    if wanted:
        target_ref = next(
            (cand for cand in candidates if wanted & lowered(cand.alias, cand.name)),
            None,
        )
    if target_ref is None:
        label = ".".join(sorted(wanted)) or "<unknown>"
        out.errors.append(f"col_lineage_skip:merge_no_target:{label}")
        return collected

    # Every other candidate is a USING source.
    using_refs = [cand for cand in candidates if cand is not target_ref]
    lone_source = len(using_refs) == 1
    using_node = stmt.args.get("using")

    # qualifier (lower-cased) -> owning ref. setdefault keeps the FIRST
    # registration, so the target is registered before the sources and, per
    # ref, the AST alias before the ref's own alias and name.
    by_qualifier: dict[str, TableRef] = {}
    registrations = [(target_ref, merge_target_node)]
    # The USING AST node can only be paired with a source when it is the
    # sole source; with several sources there is no AST alias to attach.
    registrations += [(ref, using_node if lone_source else None) for ref in using_refs]
    for ref, node in registrations:
        if ref is None:
            continue
        node_alias = node.alias if isinstance(node, exp.Table) else None
        for label in (node_alias, ref.alias, ref.name):
            if label:
                by_qualifier.setdefault(label.lower(), ref)

    def owner_of(column: Any):
        """Return the ref owning ``column``, or None when it is ambiguous."""
        qualifier = column.table
        if qualifier:
            return by_qualifier.get(qualifier.lower())
        # Unqualified: only attributable when a single source exists.
        return using_refs[0] if lone_source else None

    def emit(dst_name: str, rhs: Any, transform: str) -> None:
        """Append one edge per ``exp.Column`` referenced inside ``rhs``."""
        if not dst_name or rhs is None:
            return
        referenced = list(rhs.find_all(exp.Column))
        if not referenced:
            # No column on the right-hand side (e.g. a literal): no edge.
            return
        dst = ColumnRef(table=target_ref, name=dst_name)
        for column in referenced:
            owner = owner_of(column)
            if owner is None:
                out.errors.append(
                    f"col_lineage_skip:merge_unresolved:{column.table or ''}.{column.name}"
                )
            else:
                collected.append(
                    LineageEdge(
                        src=ColumnRef(table=owner, name=column.name),
                        dst=dst,
                        transform=transform,
                    )
                )

    for when in stmt.find_all(exp.When):
        action = when.args.get("then")
        if isinstance(action, exp.Update):
            # WHEN MATCHED THEN UPDATE SET <target col> = <expr>, ...
            for assignment in action.expressions:
                if isinstance(assignment, exp.EQ) and isinstance(assignment.this, exp.Column):
                    emit(assignment.this.name, assignment.expression, "MERGE_UPDATE")
        elif isinstance(action, exp.Insert):
            # WHEN NOT MATCHED THEN INSERT (cols) VALUES (vals)
            column_list = action.this
            if column_list is None:
                # No explicit column list: record the skip, never guess
                # positions from the table definition.
                out.errors.append(f"col_lineage_skip:merge_no_collist:{target_ref.full_id}")
                continue
            names = list(getattr(column_list, "expressions", []) or [])
            values = list(getattr(action.expression, "expressions", []) or [])
            # Positional pairing; mismatched arities truncate to the shorter.
            for name_node, value in zip(names, values, strict=False):
                if isinstance(name_node, exp.Column):
                    emit(name_node.name, value, "MERGE_INSERT")

    return collected
841
+
663
842
  def _real_tables(self, scope: Any) -> list[TableRef]:
664
843
  """Return real (non-CTE) tables referenced in a scope.
665
844
 
@@ -1117,19 +1296,15 @@ class SqlParser(ABC):
1117
1296
  join_col_resolves: list[JoinColResolve] = []
1118
1297
  _qualify_failed: bool = False
1119
1298
 
1120
- # NEW (T-07-06): Record MERGE statements explicitly as deferred.
1121
- # sqlglot's lineage() API does not handle MERGE branches; implementing
1122
- # multi-branch lineage is deferred (see plan/sprints/sprint_07_open_ecodes.md § T-07-06).
1123
- # TODO: Remove when sqlglot adds MERGE lineage support (T-07-06).
1299
+ # MERGE column lineage (un-defer T-07-06,
1300
+ # plan/sprints/sprint_snowflake_lineage_patterns.md PR-C). sqlglot's lineage()
1301
+ # API does not handle MERGE branches, but the WHEN MATCHED UPDATE SET / WHEN NOT
1302
+ # MATCHED INSERT VALUES clauses are structurally extractable by a direct AST walk.
1303
+ # This is a bounded once-per-statement path that NEVER calls sg_lineage / qualify /
1304
+ # build_scope / exp.expand (AC-C7).
1124
1305
  if isinstance(stmt, exp.Merge):
1125
- dst_name = None
1126
- if stmt.this is not None:
1127
- try:
1128
- dst_name = stmt.this.name
1129
- except Exception:
1130
- dst_name = None
1131
- out.errors.append(f"col_lineage_skip:merge_branch:{dst_name or '<unknown>'}")
1132
- return LineageExtraction(edges=edges, star_sources=star_sources)
1306
+ merge_edges = self._extract_merge_lineage(stmt, dst_table, query_sources, out)
1307
+ return LineageExtraction(edges=merge_edges, star_sources=star_sources)
1133
1308
 
1134
1309
  # Only extract column lineage for certain statement types
1135
1310
  if not isinstance(stmt, (exp.Select, exp.Insert, exp.Create)):
@@ -35,12 +35,16 @@ class BigQueryParser(AnsiParser):
35
35
  """
36
36
  super().__init__(schema_resolver, schema_aliases=schema_aliases)
37
37
 
38
- def parse_file(self, path: Path, sql: str) -> ParsedFile:
38
+ def parse_file(self, path: Path, sql: str, rel_path: str | None = None) -> ParsedFile:
39
39
  """Parse BigQuery SQL file with scripting block detection.
40
40
 
41
41
  Args:
42
42
  path: Path to the source file
43
43
  sql: SQL text to parse
44
+ rel_path: Repo-relative posix path for CTE/temp namespace keying.
45
+ Accepted for signature parity with the other parsers (the pool and
46
+ resync paths always pass it); scripting-fallback BigQuery files do
47
+ not register CTE/temp nodes, so it is currently unused here.
44
48
 
45
49
  Returns:
46
50
  ParsedFile with parsed statements and metadata
@@ -0,0 +1,170 @@
1
+ """Dialect-agnostic bounded constant-fold of dynamic table-name expressions.
2
+
3
+ This is the pure core of the "generic variable-name resolution" feature
4
+ (``plan/sprints/feature_generic_var_name_resolution.md``). It partially folds a
5
+ string-valued expression AST (string literals, ``||`` / ``CONCAT``, and 1-hop
6
+ ``$var`` references) into a concrete ``[catalog.]db.name`` table reference,
7
+ recovering the statically-determined trailing identifier components while
8
+ honestly giving up when the name cannot be determined.
9
+
10
+ No parser/indexer state is touched here: ``resolve_dynamic_name`` is a pure
11
+ function, fully unit-testable in isolation. PR-2 wires it into the Snowflake
12
+ parser via a sink predicate; this module ships unwired.
13
+
14
+ Fold classification:
15
+
16
+ * **LIT (resolvable):** ``exp.Literal`` (string), ``exp.DPipe`` / ``exp.Concat``
17
+ over folds, and a 1-hop ``$var`` whose RHS (looked up in ``var_env`` exactly
18
+ once) itself folds to a leading constant prefix.
19
+ * **OPAQUE (unresolvable):** runtime functions (``current_database()``,
20
+ ``split_part(...)``, ``current_warehouse()``, any ``exp.Func`` /
21
+ ``exp.Anonymous`` that is not a pure string op), scalar subqueries that are not
22
+ a single-projection of folds, bind parameters (``exp.Placeholder``), and any
23
+ ``$var`` reference at chain depth >= 2.
24
+
25
+ Name extraction interprets the fold as ``[catalog.]db.name`` (name-last,
26
+ dot-split): it keeps the rightmost statically-determined ``db.name`` tail,
27
+ wildcards (drops) a leading OPAQUE catalog, keeps a static *literal* catalog, and
28
+ returns ``None`` when the whole thing is opaque / the tail is a bare name / the
29
+ tail lacks a resolvable ``db.name``.
30
+ """
31
+
32
+ from __future__ import annotations
33
+
34
+ from dataclasses import dataclass
35
+
36
+ import sqlglot.expressions as exp
37
+
38
+
39
+ @dataclass(frozen=True)
40
+ class _Lit:
41
+ """A statically-known string segment of the folded expression."""
42
+
43
+ text: str
44
+
45
+
46
+ @dataclass(frozen=True)
47
+ class _Opaque:
48
+ """A runtime-determined segment that cannot be folded statically."""
49
+
50
+
51
+ _Part = _Lit | _Opaque
52
+
53
+
54
+ def _fold_parts(
55
+ node: exp.Expression, # type: ignore[attr-defined]
56
+ var_env: dict[str, exp.Expression], # type: ignore[attr-defined]
57
+ *,
58
+ chain_depth: int,
59
+ ) -> list[_Part]:
60
+ """Fold ``node`` into an ordered list of LIT / OPAQUE parts.
61
+
62
+ Descends string-concatenation structure (``||`` / ``CONCAT``), single-
63
+ projection subqueries and parentheses; resolves 1-hop ``$var`` references
64
+ against ``var_env``; classifies everything else OPAQUE.
65
+ """
66
+ # Unwrap a single-projection (SELECT ...) scalar subquery / paren.
67
+ if isinstance(node, exp.Subquery):
68
+ return _fold_parts(node.this, var_env, chain_depth=chain_depth)
69
+ if isinstance(node, exp.Paren):
70
+ return _fold_parts(node.this, var_env, chain_depth=chain_depth)
71
+ if isinstance(node, exp.Select):
72
+ projections = node.expressions
73
+ if len(projections) != 1:
74
+ return [_Opaque()]
75
+ return _fold_parts(projections[0], var_env, chain_depth=chain_depth)
76
+
77
+ # String concatenation: DPipe is binary (left-nested), Concat is n-ary.
78
+ if isinstance(node, exp.DPipe):
79
+ return _fold_parts(node.this, var_env, chain_depth=chain_depth) + _fold_parts(
80
+ node.expression, var_env, chain_depth=chain_depth
81
+ )
82
+ if isinstance(node, exp.Concat):
83
+ parts: list[_Part] = []
84
+ for child in node.expressions:
85
+ parts.extend(_fold_parts(child, var_env, chain_depth=chain_depth))
86
+ return parts
87
+
88
+ # Concrete string literal.
89
+ if isinstance(node, exp.Literal) and node.is_string:
90
+ return [_Lit(node.this)]
91
+
92
+ # 1-hop $var reference.
93
+ if isinstance(node, exp.Parameter):
94
+ var = node.this
95
+ name = var.name if isinstance(var, exp.Var) else None
96
+ if name is None or chain_depth < 1:
97
+ return [_Opaque()]
98
+ rhs = var_env.get(name.lower())
99
+ if rhs is None:
100
+ return [_Opaque()]
101
+ # Resolve exactly one hop: the looked-up RHS may not itself follow
102
+ # further $vars (depth-2 is OPAQUE because chain_depth drops to 0).
103
+ return _fold_parts(rhs, var_env, chain_depth=chain_depth - 1)
104
+
105
+ # Everything else (runtime funcs, bind params, non-fold subqueries) is opaque.
106
+ return [_Opaque()]
107
+
108
+
109
def resolve_dynamic_name(
    rhs_expr: exp.Expression,  # type: ignore[attr-defined]
    var_env: dict[str, exp.Expression],  # type: ignore[attr-defined]
    *,
    chain_depth: int = 1,
) -> exp.Table | None:
    """Bounded partial constant-fold of a string-expression AST into a table.

    The expression is folded into LIT / OPAQUE parts, the rightmost
    contiguous run of LIT parts is joined into a static suffix, and that
    suffix is parsed as a Snowflake table path.

    Args:
        rhs_expr: The expression assigned to the dynamic name.
        var_env: Lower-cased variable name -> its bound expression, used for
            variable look-ups during folding.
        chain_depth: Maximum number of variable look-up hops (default 1).

    Returns:
        An ``exp.Table`` for the statically-determined trailing identifier
        components, or ``None`` when the name cannot be determined:

        * the fold produced no parts, or no trailing literal run;
        * the static suffix is not a parsable table path (sqlglot raised);
        * the parsed path lacks a ``db`` or a ``name`` component.

        This function never raises for an unparsable suffix: giving up is
        always signalled by ``None``.
    """
    # Function-scope import so the block brings its own dependency into scope.
    from sqlglot.errors import SqlglotError

    parts = _fold_parts(rhs_expr, var_env, chain_depth=chain_depth)
    if not parts:
        return None

    # Walk from the right collecting literals until the first non-literal.
    # Hitting a non-literal means a runtime segment sits directly before the
    # static suffix.
    suffix_lits: list[str] = []
    opaque_precedes_suffix = False
    for part in reversed(parts):
        if not isinstance(part, _Lit):
            opaque_precedes_suffix = True
            break
        suffix_lits.append(part.text)
    suffix_lits.reverse()

    if not suffix_lits:
        # All-opaque (no static tail at all).
        return None

    suffix = "".join(suffix_lits)
    try:
        table = exp.to_table(suffix, dialect="snowflake")
    except SqlglotError:
        # The static tail is not a valid table path (e.g. empty literal,
        # dangling dot, unterminated quote). Honour the give-up contract
        # instead of letting the parse/tokenize error escape to the caller.
        return None

    # A resolvable object id needs at least db.name (schema.table). A bare
    # name (no db) is a give-up: the schema is never guessed.
    if table.db == "" or table.name == "":
        return None

    # Catalog policy:
    #   - a literal catalog inside the static suffix with nothing opaque
    #     before it is kept as-is;
    #   - when an opaque segment precedes the suffix, the catalog slot may
    #     have been (partly) produced at runtime, so it is dropped and a
    #     catalog-less db.name is returned.
    # NOTE(review): with an opaque prefix and a 2-part suffix that does not
    # start with '.', the first component ends up in ``db`` even though it
    # may only be the tail of a longer identifier — confirm this is intended.
    if opaque_precedes_suffix and table.catalog != "":
        table.set("catalog", None)

    return table