PyPI - sql-code-graph - Versions diffs - 1.35.1__py3-none-any.whl → 1.35.3__py3-none-any.whl - Mend

sql-code-graph 1.35.1__py3-none-any.whl → 1.35.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

{sql_code_graph-1.35.1.dist-info → sql_code_graph-1.35.3.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sql-code-graph
-Version: 1.35.1
+Version: 1.35.3
 Summary: SQL code graph analyzer and lineage tracer
 Project-URL: Homepage, https://github.com/Warhorze/sql-code-graph
 Project-URL: Repository, https://github.com/Warhorze/sql-code-graph

{sql_code_graph-1.35.1.dist-info → sql_code_graph-1.35.3.dist-info}/RECORD RENAMED Viewed

@@ -1,4 +1,4 @@
-sqlcg/__init__.py,sha256=VahDFz44BfVysbx4bUBilcYdqMPT5hNDCMqoIkgonw0,116
+sqlcg/__init__.py,sha256=iuiB1QBl9EVtoW0aNt89z6gr_yDaZ5RV7phaFO8zX0Y,116
 sqlcg/__main__.py,sha256=1YoFLcqEgTwYq1J3TbUwpkdG0zeeLIf2fJvwWI-CLFU,109
 sqlcg/cli/__init__.py,sha256=W8fD0LpMq2xm_5WKGNMvJh2WBL1ho5E8hUeAqXQYT1g,28
 sqlcg/cli/coverage.py,sha256=Xm9ITzZDHv2mJ70Q5jCacVuhDStVrE3gq12_-Ypvtd8,43823
@@ -34,7 +34,7 @@ sqlcg/indexer/__init__.py,sha256=Wh20Unz2OHs1oIyWLrpurPAasF0BET2g4iXtNk7mh2U,56
 sqlcg/indexer/dbt_adapter.py,sha256=EB5x1WU5Z9d-I97ADDj88S_hG1C4z4nbrv8JUCzXfy8,686
 sqlcg/indexer/error_classify.py,sha256=-sp8cRmuOBHu_CxnCtaXf34YxHFYwIFNjIrn4LaEv6M,7142
 sqlcg/indexer/git_delta.py,sha256=zYdH5q-jV7w_ne8Oxdywsy0N3rwUjpd5RjEDurlrMSA,5026
-sqlcg/indexer/indexer.py,sha256=u6fIhgLfBs9fKJoAyO4hJrm9152YMe08EHycncctQx8,102697
+sqlcg/indexer/indexer.py,sha256=LNc5pI3_WwIMT2iTnCu9VB1Kk21T1qAmN7fupyiQu2s,105315
 sqlcg/indexer/pool.py,sha256=iMmCQtpDRKBTQBep2_EUq9THcsE18Zgk0hdaFB_CwiA,19006
 sqlcg/indexer/walker.py,sha256=Cft6JiJtdBFy0HR6L9pJdr5Fg0eRR3XBW1OMtM2apto,1947
 sqlcg/indexer/watcher.py,sha256=mJQq1LASRLKKwhz0WhCUWPLLqyPR2_-FD_8efYU6gE8,8442
@@ -45,8 +45,9 @@ sqlcg/metrics/__init__.py,sha256=hLJ6wm4St8qqYwKh3o9QG7lcEt1BEYM31ccqO9tGpIg,133
 sqlcg/metrics/store.py,sha256=KuDtxvyAgug9_KtiSCpvgKM2VZM7VSaI3D11uMLjJJk,10604
 sqlcg/parsers/__init__.py,sha256=AamA8wBbDZV9_zEtZCI4Hyen5UAVKHmBwjTghTt2PZE,785
 sqlcg/parsers/ansi_parser.py,sha256=RX6eVj7gt1qmsHNJLAF_a4jyW3RCI5W2oF4rd53cKNg,39336
-sqlcg/parsers/base.py,sha256=Q2X_qN7DmGFU2oA6Z9fcxyjpkeb6lOsia4BcwPvwNLo,97806
-sqlcg/parsers/bigquery_parser.py,sha256=mOnWTfXB_Dp4JwFE1PVYOB6CDPf5nYE0Dea8kJCl9uQ,2827
+sqlcg/parsers/base.py,sha256=d5s5_LSv96jrww9vx52GujjrLHwpxy_UOhmIlWcKglw,106489
+sqlcg/parsers/bigquery_parser.py,sha256=g0B6aIpMyxLMVQ3ohAAjzR4nEmMh-WGkFcYLMiKdLxs,3177
+sqlcg/parsers/dynamic_name.py,sha256=q0QAa9iAcmRW4e_0G2b2j-xTbI3VR1-Wwa-nJRLtrQw,6836
 sqlcg/parsers/postgres_parser.py,sha256=lYfUpQY6j4Qm7ndXBtXbgPoGzYqYddWt5YeFnWKdA6I,946
 sqlcg/parsers/registry.py,sha256=LXy1F6rqQI6VdxpRvZg_tNpoEucW3mXZHYBMlMONbX4,1496
 sqlcg/parsers/snowflake_parser.py,sha256=cv7bzBm6Wmwa8uY41Y59ebfFjnP1Gk0Sjp2KN_QBGD8,47542
@@ -72,7 +73,7 @@ sqlcg/viz/render.py,sha256=BINkGbJbbb_iqhrkN795RaQsdg8nqCiJtsEFF1yo22Y,2737
 sqlcg/viz/tags.py,sha256=6zRnGlHjuGmEeB6yN1uhzm8rqL7ZGoyL1Ki7jI5oM6A,5368
 sqlcg/viz/assets/force-graph.min.js,sha256=jNdYdDdrYiUdUlElxRkolPBt30rstQk2q15Q32VVdzc,177272
 sqlcg/viz/assets/template.html,sha256=9_j-mvo1ZxwgiJPDdVrNmca37dTrTjjYVd3977u-DxE,12294
-sql_code_graph-1.35.1.dist-info/METADATA,sha256=YOc2OzDrdadgZdpmY3QhwueUwScKj39svzKO4dgd9Q0,17791
-sql_code_graph-1.35.1.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
-sql_code_graph-1.35.1.dist-info/entry_points.txt,sha256=Wfe49sVzV9p4eVFGo5RxcV-frr3HOP0yzzst8JBxQLQ,46
-sql_code_graph-1.35.1.dist-info/RECORD,,
+sql_code_graph-1.35.3.dist-info/METADATA,sha256=bR0GUYuujbDEYNj4602aE5Olejev4X6hp7KYlaezZjg,17791
+sql_code_graph-1.35.3.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
+sql_code_graph-1.35.3.dist-info/entry_points.txt,sha256=Wfe49sVzV9p4eVFGo5RxcV-frr3HOP0yzzst8JBxQLQ,46
+sql_code_graph-1.35.3.dist-info/RECORD,,

sqlcg/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
 """SQL Code Graph - SQL lineage and dependency analysis tool."""
-__version__ = "1.35.1"
+__version__ = "1.35.3"
 __all__ = ["__version__"]

sqlcg/indexer/indexer.py CHANGED Viewed

@@ -388,7 +388,7 @@ def _flush_row_batch(
     )
-def _subprocess_parse_worker(parser_cls, dialect, path, sql, q):
+def _subprocess_parse_worker(parser_cls, dialect, path, sql, q, rel_path=None):
     """Parse a single file in a subprocess; queue the ParsedFile (or exception).
     parser_cls must be the *class* (pickleable), not an instance. The worker
@@ -398,10 +398,16 @@ def _subprocess_parse_worker(parser_cls, dialect, path, sql, q):
     T-09-04: Parser constructors require a SchemaResolver. The subprocess gets a
     fresh empty resolver; column resolution runs in infer-only mode, the same as
     small-repo mode.
+    #171: rel_path is the repo-relative posix path used for CTE/temp namespace
+    keying.  It MUST be forwarded so the incremental path produces the same keys
+    as index_repo (which threads rel_path through its task dict); without it the
+    namespace falls back to the absolute OS path, creating duplicate CTE/temp
+    nodes after an incremental reindex.
     """
     try:
         parser = parser_cls(SchemaResolver(dialect=str(dialect) if dialect else None))
-        out = parser.parse_file(path, sql)
+        out = parser.parse_file(path, sql, rel_path=rel_path)
         q.put(out)
     except BaseException as exc:
         # Send the exception back; parent will re-raise.
@@ -1129,6 +1135,22 @@ class Indexer:
         schema_resolver = SchemaResolver(dialect=dialect)
         parser = get_parser(dialect, schema_resolver)
+        # #170/#171: load schema_aliases and compute repo-relative posix paths the
+        # SAME way index_repo does, so the incremental path applies the same alias
+        # normalisation (#170) and CTE/temp namespace keys (#171).  Without these,
+        # a branch-switch resync produced phantom *_tmp.* nodes (aliases unapplied)
+        # and duplicate CTE/temp nodes (absolute-path keys).
+        from sqlcg.core.config import get_schema_aliases
+        schema_aliases = get_schema_aliases(root)
+        root_resolved = Path(root).resolve()
+        def _rel_posix(fp: Path) -> str:
+            try:
+                return fp.resolve().relative_to(root_resolved).as_posix()
+            except ValueError:
+                return fp.as_posix()
         pass1_results: list[ParsedFile] = []
         for file_path in reparse_set:
             try:
@@ -1141,7 +1163,9 @@ class Indexer:
                 pass1_results.append(placeholder)
                 continue
             try:
-                parsed = self._index_single_file(parser, file_path, sql, timeout_per_file)
+                parsed = self._index_single_file(
+                    parser, file_path, sql, timeout_per_file, rel_path=_rel_posix(file_path)
+                )
             except Exception as exc:
                 logger.warning("resync_changed: parse failed %s: %s", file_path, exc)
                 parsed = ParsedFile(path=file_path, dialect=dialect)
@@ -1287,7 +1311,11 @@ class Indexer:
                         def_path = Path(definer_fp)
                         def_sql = def_path.read_text(encoding="utf-8")
                         def_parsed = self._index_single_file(
-                            parser, def_path, def_sql, timeout_per_file
+                            parser,
+                            def_path,
+                            def_sql,
+                            timeout_per_file,
+                            rel_path=_rel_posix(def_path),
                         )
                         # Harvest only — register for cross_file_sources but do NOT upsert
                         aggregator.register_pass1(def_parsed)
@@ -1315,7 +1343,7 @@ class Indexer:
                 )
                 continue
             try:
-                cl_parsed = parser.parse_file(cl_path, cl_sql)
+                cl_parsed = parser.parse_file(cl_path, cl_sql, rel_path=_rel_posix(cl_path))
             except Exception as exc:
                 logger.warning("resync_changed: parse failed for closure file %s: %s", cl_path, exc)
                 cl_parsed = ParsedFile(path=cl_path, dialect=dialect)
@@ -1325,7 +1353,19 @@ class Indexer:
         # ---- Step 7: Batched bulk upsert (same _flush_batch path as index_repo) ----
         all_results = pass1_results + closure_results
-        # Build a registry for duplicate DDL detection
+        # #170: key-normalisation choke point — apply schema_aliases + empty-identity
+        # guard to EVERY parse result BEFORE the defined_table_registry is built and
+        # before _upsert_file_batch.  index_repo (line ~797) and reindex_file
+        # (line ~1421) both do this; resync_changed previously did not, so a
+        # branch-switch incremental reindex left staging-alias schemas (e.g. ba_tmp)
+        # un-normalised, producing phantom *_tmp.* nodes that a from-scratch index
+        # never creates.  O(edges) per file, once per resync — outside the hot loop.
+        from sqlcg.parsers.base import normalize_keys as _normalize_keys
+        for pf in all_results:
+            _normalize_keys(pf, schema_aliases)
+        # Build a registry for duplicate DDL detection (post-normalisation full_ids)
         defined_table_registry: dict[str, str] = {}
         for pf in all_results:
             for table in pf.defined_tables:
@@ -1432,7 +1472,14 @@ class Indexer:
         # join-column edges until the next full index (plan-review BLOCKER).
         self._resolve_join_columns(db)
-    def _index_single_file(self, parser, path: Path, sql: str, timeout: int) -> ParsedFile:
+    def _index_single_file(
+        self,
+        parser,
+        path: Path,
+        sql: str,
+        timeout: int,
+        rel_path: str | None = None,
+    ) -> ParsedFile:
         """Parse one file, with optional timeout via subprocess isolation.
         T-09-04: Subprocess isolation via multiprocessing.Process + spawn context.
@@ -1446,12 +1493,17 @@ class Indexer:
             path: Path to the file
             sql: SQL text
             timeout: Timeout in seconds (0 = no timeout)
+            rel_path: Repo-relative posix path for CTE/temp namespace keying
+                (#171).  Threaded through to parse_file (both the in-process and
+                subprocess branch) so the incremental path produces the same
+                namespace keys as index_repo.  Falls back to str(path) inside the
+                parser when None.
         Returns:
             ParsedFile with parse_failed flag set if timeout occurs
         """
         if timeout <= 0:
-            return parser.parse_file(path, sql)
+            return parser.parse_file(path, sql, rel_path=rel_path)
         ctx = mp.get_context("spawn")  # avoid fork-inherit pitfalls (KuzuDB connection FD etc.)
         # Unbounded queue: the child writes one large ParsedFile (192–552 KB pickled).
@@ -1462,7 +1514,7 @@ class Indexer:
         q: mp.Queue = ctx.Queue()
         proc = ctx.Process(
             target=_subprocess_parse_worker,
-            args=(parser.__class__, parser.DIALECT, path, sql, q),
+            args=(parser.__class__, parser.DIALECT, path, sql, q, rel_path),
             daemon=True,
         )
         proc.start()

sqlcg/parsers/base.py CHANGED Viewed

@@ -637,12 +637,14 @@ class SqlParser(ABC):
         """Return True if `_extract_column_lineage` would attempt extraction for `stmt`.
         Mirrors the type/body checks `_extract_column_lineage` (below, this class)
-        already performs: only `exp.Select`, `exp.Insert` with a `SELECT` body, and
-        `exp.Create` with a `Select`/`Subquery` body (CTAS / CREATE VIEW AS SELECT)
-        can ever produce column-lineage edges. `exp.Merge` is explicitly skipped there
-        (T-07-06, deferred). Everything else (Update, Delete, Use, Set, Comment, Drop,
-        Alter, Command, TruncateTable, CREATE ... LIKE/CLONE/column-defs, INSERT ...
-        VALUES) is structurally lineage-free regardless of `build_scope`'s outcome.
+        already performs: only `exp.Select`, `exp.Insert` with a `SELECT` body,
+        `exp.Create` with a `Select`/`Subquery` body (CTAS / CREATE VIEW AS SELECT),
+        and `exp.Merge` (column edges extracted structurally from its WHEN clauses —
+        see `_extract_merge_lineage`, plan/sprints/sprint_snowflake_lineage_patterns.md
+        PR-C) can ever produce column-lineage edges. Everything else (Update, Delete,
+        Use, Set, Comment, Drop, Alter, Command, TruncateTable, CREATE ...
+        LIKE/CLONE/column-defs, INSERT ... VALUES) is structurally lineage-free
+        regardless of `build_scope`'s outcome.
         Used by `AnsiParser._parse_statement` to avoid marking `parse_failed=True`
         for statement kinds that were never going to produce column lineage in the
@@ -658,8 +660,185 @@ class SqlParser(ABC):
             return isinstance(stmt.expression, exp.Select)
         if isinstance(stmt, exp.Create):
             return isinstance(stmt.expression, (exp.Select, exp.Subquery))
+        if isinstance(stmt, exp.Merge):
+            return True
         return False
+    def _extract_merge_lineage(
+        self,
+        stmt: Any,
+        dst_table: "TableRef | None",
+        query_sources: list["TableRef"] | None,
+        out: ParsedFile,
+    ) -> list[LineageEdge]:
+        """Extract column-level lineage from a MERGE statement's WHEN clauses.
+        MERGE column edges cannot come from sqlglot's ``lineage()`` (it does not model
+        MERGE branches), but the ``WHEN MATCHED ... UPDATE SET`` and
+        ``WHEN NOT MATCHED ... INSERT (cols) VALUES (vals)`` clauses are structurally
+        extractable by a direct AST walk. This method performs that walk: it is bounded
+        and runs ONCE PER MERGE statement — it NEVER calls ``sg_lineage``, ``qualify``,
+        ``build_scope`` or ``exp.expand`` (AC-C7,
+        plan/sprints/sprint_snowflake_lineage_patterns.md PR-C).
+        Resolution model:
+        - The MERGE target (``stmt.this``) and source (``stmt.args['using']``) are both
+          captured in ``query_sources`` as already-qualified TableRefs (the table-grain
+          path does NOT split them into target/sources for MERGE — ``dst_table`` is
+          ``None`` here, so AC-C3's shipped table-grain behaviour is preserved). This
+          method matches ``stmt.this``/``using`` back to those refs by alias+name to
+          identify which is the target and which the source(s), then builds an
+          alias -> TableRef map over both.
+        - MATCHED UPDATE: each ``exp.EQ`` is ``target_col = source_expr``; the dst is a
+          column on the target, the src columns are every ``exp.Column`` in the RHS
+          (multi-source expressions emit one edge per source column — AC-C4). A
+          pure-literal RHS (no ``exp.Column``) contributes no edge (AC-C5) — mirrors the
+          pure-literal skip invariant.
+        - NOT MATCHED INSERT: ``then.this`` is the target column tuple, ``then.expression``
+          the values tuple; zip positionally and emit per source column. An INSERT with
+          no column list (``then.this is None``) is skip-and-logged
+          (``col_lineage_skip:merge_no_collist:``) — NO positional DDL fallback (gate
+          decision: positional guessing risks wrong edges).
+        Args:
+            stmt: the ``exp.Merge`` AST node.
+            dst_table: the resolved INSERT/CREATE target TableRef from the caller. For
+                MERGE this is ``None`` (the table-grain path leaves the target in
+                ``query_sources``); the target is recovered here from ``stmt.this``.
+            query_sources: resolved table refs for the MERGE (target + ``USING``
+                source(s)), each carrying its ``.alias``, fully qualified.
+            out: ParsedFile, for skip/unresolved error records.
+        Returns:
+            List of LineageEdge (may be empty).
+        """
+        import sqlglot.expressions as exp
+        edges: list[LineageEdge] = []
+        refs = list(query_sources or [])
+        if dst_table is not None:
+            refs.append(dst_table)
+        # Identify the MERGE target ref by matching stmt.this against the resolved refs.
+        # `_real_tables` does not preserve the AST alias on the TableRef, so match on the
+        # AST target's name (and alias, as a fallback) against each ref's name/alias.
+        # The remaining refs are the USING source(s).
+        target_ast = stmt.this if isinstance(stmt.this, exp.Table) else None
+        target_keys: set[str] = set()
+        if target_ast is not None:
+            if target_ast.name:
+                target_keys.add(target_ast.name.lower())
+            if target_ast.alias:
+                target_keys.add(target_ast.alias.lower())
+        def _ref_keys(ref: TableRef) -> set[str]:
+            keys: set[str] = set()
+            if ref.alias:
+                keys.add(ref.alias.lower())
+            if ref.name:
+                keys.add(ref.name.lower())
+            return keys
+        target_ref: TableRef | None = None
+        if target_keys:
+            for ref in refs:
+                if target_keys & _ref_keys(ref):
+                    target_ref = ref
+                    break
+        if target_ref is None:
+            label = ".".join(sorted(target_keys)) or "<unknown>"
+            out.errors.append(f"col_lineage_skip:merge_no_target:{label}")
+            return edges
+        source_refs = [r for r in refs if r is not target_ref]
+        # Build an alias/name -> TableRef map over the target + USING source(s).
+        # MERGE column references qualify by the table alias (e.g. `target.name`,
+        # `s.name`). `_real_tables` strips the alias from the resolved TableRef, so the
+        # alias is recovered from the AST nodes (target=stmt.this, source=using) and
+        # mapped to its resolved ref. Bare table names are registered as a fallback.
+        alias_map: dict[str, TableRef] = {}
+        def _register(ref: TableRef | None, ast_node: Any) -> None:
+            if ref is None:
+                return
+            if isinstance(ast_node, exp.Table) and ast_node.alias:
+                alias_map.setdefault(ast_node.alias.lower(), ref)
+            if ref.alias:
+                alias_map.setdefault(ref.alias.lower(), ref)
+            if ref.name:
+                alias_map.setdefault(ref.name.lower(), ref)
+        using_ast = stmt.args.get("using")
+        _register(target_ref, target_ast)
+        for src in source_refs:
+            _register(src, using_ast if len(source_refs) == 1 else None)
+        def _resolve_table(col: Any) -> TableRef | None:
+            """Resolve the owning TableRef of a source column reference."""
+            tbl_name = col.table
+            if tbl_name:
+                return alias_map.get(tbl_name.lower())
+            # Unqualified RHS column — attribute to the lone source when unambiguous.
+            if len(source_refs) == 1:
+                return source_refs[0]
+            return None
+        def _emit(target_col_name: str, rhs: Any, transform: str) -> None:
+            """Emit one edge per source ``exp.Column`` found in ``rhs``."""
+            if not target_col_name:
+                return
+            cols = list(rhs.find_all(exp.Column)) if rhs is not None else []
+            if not cols:
+                # Pure-literal / no source column — no edge (AC-C5).
+                return
+            dst_ref = ColumnRef(table=target_ref, name=target_col_name)
+            for col in cols:
+                src_table = _resolve_table(col)
+                if src_table is None:
+                    out.errors.append(
+                        f"col_lineage_skip:merge_unresolved:{col.table or ''}.{col.name}"
+                    )
+                    continue
+                edges.append(
+                    LineageEdge(
+                        src=ColumnRef(table=src_table, name=col.name),
+                        dst=dst_ref,
+                        transform=transform,
+                    )
+                )
+        for when in stmt.find_all(exp.When):
+            then = when.args.get("then")
+            if isinstance(then, exp.Update):
+                # WHEN MATCHED THEN UPDATE SET target.a = s.a, target.b = s.x + s.y
+                for eq in then.expressions:
+                    if not isinstance(eq, exp.EQ):
+                        continue
+                    lhs = eq.this
+                    if not isinstance(lhs, exp.Column):
+                        continue
+                    _emit(lhs.name, eq.expression, "MERGE_UPDATE")
+            elif isinstance(then, exp.Insert):
+                # WHEN NOT MATCHED THEN INSERT (cols) VALUES (vals)
+                target_cols = then.this
+                values = then.expression
+                if target_cols is None:
+                    # INSERT VALUES with no column list — skip-and-log, no positional
+                    # DDL fallback (gate decision: positional guessing risks wrong edges).
+                    out.errors.append(f"col_lineage_skip:merge_no_collist:{target_ref.full_id}")
+                    continue
+                col_exprs = list(getattr(target_cols, "expressions", []) or [])
+                val_exprs = list(getattr(values, "expressions", []) or [])
+                # Positional alignment of (cols) with VALUES (...); a malformed MERGE
+                # with mismatched arities zips to the shorter list rather than raising.
+                for target_col, value in zip(col_exprs, val_exprs, strict=False):
+                    if not isinstance(target_col, exp.Column):
+                        continue
+                    _emit(target_col.name, value, "MERGE_INSERT")
+        return edges
     def _real_tables(self, scope: Any) -> list[TableRef]:
         """Return real (non-CTE) tables referenced in a scope.
@@ -1117,19 +1296,15 @@ class SqlParser(ABC):
         join_col_resolves: list[JoinColResolve] = []
         _qualify_failed: bool = False
-        # NEW (T-07-06): Record MERGE statements explicitly as deferred.
-        # sqlglot's lineage() API does not handle MERGE branches; implementing
-        # multi-branch lineage is deferred (see plan/sprints/sprint_07_open_ecodes.md § T-07-06).
-        # TODO: Remove when sqlglot adds MERGE lineage support (T-07-06).
+        # MERGE column lineage (un-defer T-07-06,
+        # plan/sprints/sprint_snowflake_lineage_patterns.md PR-C). sqlglot's lineage()
+        # API does not handle MERGE branches, but the WHEN MATCHED UPDATE SET / WHEN NOT
+        # MATCHED INSERT VALUES clauses are structurally extractable by a direct AST walk.
+        # This is a bounded once-per-statement path that NEVER calls sg_lineage / qualify /
+        # build_scope / exp.expand (AC-C7).
         if isinstance(stmt, exp.Merge):
-            dst_name = None
-            if stmt.this is not None:
-                try:
-                    dst_name = stmt.this.name
-                except Exception:
-                    dst_name = None
-            out.errors.append(f"col_lineage_skip:merge_branch:{dst_name or '<unknown>'}")
-            return LineageExtraction(edges=edges, star_sources=star_sources)
+            merge_edges = self._extract_merge_lineage(stmt, dst_table, query_sources, out)
+            return LineageExtraction(edges=merge_edges, star_sources=star_sources)
         # Only extract column lineage for certain statement types
         if not isinstance(stmt, (exp.Select, exp.Insert, exp.Create)):

sqlcg/parsers/bigquery_parser.py CHANGED Viewed

@@ -35,12 +35,16 @@ class BigQueryParser(AnsiParser):
         """
         super().__init__(schema_resolver, schema_aliases=schema_aliases)
-    def parse_file(self, path: Path, sql: str) -> ParsedFile:
+    def parse_file(self, path: Path, sql: str, rel_path: str | None = None) -> ParsedFile:
         """Parse BigQuery SQL file with scripting block detection.
         Args:
             path: Path to the source file
             sql: SQL text to parse
+            rel_path: Repo-relative posix path for CTE/temp namespace keying.
+                Accepted for signature parity with the other parsers (the pool and
+                resync paths always pass it); scripting-fallback BigQuery files do
+                not register CTE/temp nodes, so it is currently unused here.
         Returns:
             ParsedFile with parsed statements and metadata

sqlcg/parsers/dynamic_name.py ADDED Viewed

@@ -0,0 +1,170 @@
+"""Dialect-agnostic bounded constant-fold of dynamic table-name expressions.
+This is the pure core of the "generic variable-name resolution" feature
+(``plan/sprints/feature_generic_var_name_resolution.md``). It partially folds a
+string-valued expression AST (string literals, ``||`` / ``CONCAT``, and 1-hop
+``$var`` references) into a concrete ``[catalog.]db.name`` table reference,
+recovering the statically-determined trailing identifier components while
+honestly giving up when the name cannot be determined.
+No parser/indexer state is touched here: ``resolve_dynamic_name`` is a pure
+function, fully unit-testable in isolation. PR-2 wires it into the Snowflake
+parser via a sink predicate; this module ships unwired.
+Fold classification:
+* **LIT (resolvable):** ``exp.Literal`` (string), ``exp.DPipe`` / ``exp.Concat``
+  over folds, and a 1-hop ``$var`` whose RHS (looked up in ``var_env`` exactly
+  once) itself folds to a leading constant prefix.
+* **OPAQUE (unresolvable):** runtime functions (``current_database()``,
+  ``split_part(...)``, ``current_warehouse()``, any ``exp.Func`` /
+  ``exp.Anonymous`` that is not a pure string op), scalar subqueries that are not
+  a single-projection of folds, bind parameters (``exp.Placeholder``), and any
+  ``$var`` reference at chain depth >= 2.
+Name extraction interprets the fold as ``[catalog.]db.name`` (name-last,
+dot-split): it keeps the rightmost statically-determined ``db.name`` tail,
+wildcards (drops) a leading OPAQUE catalog, keeps a static *literal* catalog, and
+returns ``None`` when the whole thing is opaque / the tail is a bare name / the
+tail lacks a resolvable ``db.name``.
+"""
+from __future__ import annotations
+from dataclasses import dataclass
+import sqlglot.expressions as exp
+@dataclass(frozen=True)
+class _Lit:
+    """A statically-known string segment of the folded expression."""
+    text: str
+@dataclass(frozen=True)
+class _Opaque:
+    """A runtime-determined segment that cannot be folded statically."""
+_Part = _Lit | _Opaque
+def _fold_parts(
+    node: exp.Expression,  # type: ignore[attr-defined]
+    var_env: dict[str, exp.Expression],  # type: ignore[attr-defined]
+    *,
+    chain_depth: int,
+) -> list[_Part]:
+    """Fold ``node`` into an ordered list of LIT / OPAQUE parts.
+    Descends string-concatenation structure (``||`` / ``CONCAT``), single-
+    projection subqueries and parentheses; resolves 1-hop ``$var`` references
+    against ``var_env``; classifies everything else OPAQUE.
+    """
+    # Unwrap a single-projection (SELECT ...) scalar subquery / paren.
+    if isinstance(node, exp.Subquery):
+        return _fold_parts(node.this, var_env, chain_depth=chain_depth)
+    if isinstance(node, exp.Paren):
+        return _fold_parts(node.this, var_env, chain_depth=chain_depth)
+    if isinstance(node, exp.Select):
+        projections = node.expressions
+        if len(projections) != 1:
+            return [_Opaque()]
+        return _fold_parts(projections[0], var_env, chain_depth=chain_depth)
+    # String concatenation: DPipe is binary (left-nested), Concat is n-ary.
+    if isinstance(node, exp.DPipe):
+        return _fold_parts(node.this, var_env, chain_depth=chain_depth) + _fold_parts(
+            node.expression, var_env, chain_depth=chain_depth
+        )
+    if isinstance(node, exp.Concat):
+        parts: list[_Part] = []
+        for child in node.expressions:
+            parts.extend(_fold_parts(child, var_env, chain_depth=chain_depth))
+        return parts
+    # Concrete string literal.
+    if isinstance(node, exp.Literal) and node.is_string:
+        return [_Lit(node.this)]
+    # 1-hop $var reference.
+    if isinstance(node, exp.Parameter):
+        var = node.this
+        name = var.name if isinstance(var, exp.Var) else None
+        if name is None or chain_depth < 1:
+            return [_Opaque()]
+        rhs = var_env.get(name.lower())
+        if rhs is None:
+            return [_Opaque()]
+        # Resolve exactly one hop: the looked-up RHS may not itself follow
+        # further $vars (depth-2 is OPAQUE because chain_depth drops to 0).
+        return _fold_parts(rhs, var_env, chain_depth=chain_depth - 1)
+    # Everything else (runtime funcs, bind params, non-fold subqueries) is opaque.
+    return [_Opaque()]
+def resolve_dynamic_name(
+    rhs_expr: exp.Expression,  # type: ignore[attr-defined]
+    var_env: dict[str, exp.Expression],  # type: ignore[attr-defined]
+    *,
+    chain_depth: int = 1,
+) -> exp.Table | None:
+    """Bounded partial constant-fold of a string-expression AST into a TableRef.
+    Args:
+        rhs_expr: the RHS expression assigned to the dynamic name (the AST that
+            the ``IDENTIFIER($var)`` sink dereferences).
+        var_env: lowercased var name -> its RHS AST, for 1-hop chain resolution.
+        chain_depth: max var-lookup hops (default 1; corpus max is 1). A var
+            whose value references another var resolves only the first hop.
+    Returns:
+        An ``exp.Table`` for the resolvable trailing identifier components, or
+        ``None`` to give up honestly (caller leaves the sink dropped).
+    """
+    parts = _fold_parts(rhs_expr, var_env, chain_depth=chain_depth)
+    if not parts:
+        return None
+    # Take the rightmost contiguous run of LIT parts (the static suffix), noting
+    # whether an OPAQUE segment sits immediately before that run (catalog
+    # position is then runtime-determined).
+    suffix_lits: list[str] = []
+    opaque_precedes_suffix = False
+    for part in reversed(parts):
+        if isinstance(part, _Lit):
+            suffix_lits.append(part.text)
+        else:
+            opaque_precedes_suffix = True
+            break
+    suffix_lits.reverse()
+    if not suffix_lits:
+        # All-opaque (no static tail at all).
+        return None
+    suffix = "".join(suffix_lits)
+    table = exp.to_table(suffix, dialect="snowflake")
+    # A resolvable object id needs at least db.name (schema.table). A bare name
+    # (no db) is a give-up: we never guess the schema/table. A missing name (tail
+    # ends on a dot) is likewise unresolvable.
+    if table.db == "" or table.name == "":
+        return None
+    # Catalog policy:
+    #   - literal catalog present in the static suffix (3-part name) -> KEEP it.
+    #   - an OPAQUE segment precedes the suffix (the catalog slot came from a
+    #     runtime segment) -> wildcard / drop the catalog (emit catalog-less
+    #     db.name) so the node merges with DDL / plain-SQL refs to the same table.
+    # to_table already yields catalog='' when the static suffix had no catalog
+    # component (e.g. '.EMP.APPLICATION_FIELDS' or 'DHB.KOSTEN'); the explicit
+    # drop only matters when an opaque prefix sits before a literal 3-part tail.
+    if opaque_precedes_suffix and table.catalog != "":
+        table.set("catalog", None)
+    return table

{sql_code_graph-1.35.1.dist-info → sql_code_graph-1.35.3.dist-info}/WHEEL RENAMED Viewed

File without changes

{sql_code_graph-1.35.1.dist-info → sql_code_graph-1.35.3.dist-info}/entry_points.txt RENAMED Viewed

File without changes

sql-code-graph 1.35.1__py3-none-any.whl → 1.35.3__py3-none-any.whl

sql-code-graph 1.35.1__py3-none-any.whl → 1.35.3__py3-none-any.whl