PyPI - sql-code-graph - Versions diffs - 1.1.0__py3-none-any.whl → 1.2.2__py3-none-any.whl - Mend

sql-code-graph 1.1.0__py3-none-any.whl → 1.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

{sql_code_graph-1.1.0.dist-info → sql_code_graph-1.2.2.dist-info}/METADATA +11 -1
{sql_code_graph-1.1.0.dist-info → sql_code_graph-1.2.2.dist-info}/RECORD +19 -18
{sql_code_graph-1.1.0.dist-info → sql_code_graph-1.2.2.dist-info}/WHEEL +1 -1
sqlcg/__init__.py +1 -1
sqlcg/cli/commands/analyze.py +156 -134
sqlcg/cli/commands/db.py +92 -86
sqlcg/cli/commands/find.py +30 -33
sqlcg/cli/commands/gain.py +13 -11
sqlcg/core/config.py +35 -5
sqlcg/core/kuzu_backend.py +4 -1
sqlcg/core/queries.cypher +0 -6
sqlcg/core/queries.py +0 -1
sqlcg/indexer/indexer.py +109 -11
sqlcg/lineage/aggregator.py +17 -45
sqlcg/parsers/ansi_parser.py +2 -2
sqlcg/parsers/base.py +7 -1
sqlcg/server/read_client.py +192 -0
sqlcg/server/server.py +97 -18
{sql_code_graph-1.1.0.dist-info → sql_code_graph-1.2.2.dist-info}/entry_points.txt +0 -0

sqlcg/cli/commands/db.py CHANGED Viewed

@@ -10,6 +10,7 @@ from rich.console import Console
 from sqlcg.core.config import get_backend, get_db_path
 from sqlcg.core.freshness import compute_freshness, render_freshness_line
 from sqlcg.core.schema import NodeLabel
+from sqlcg.server.read_client import run_read_routed
 from sqlcg.utils.logging import getLogger
 logger = getLogger(__name__)
@@ -75,107 +76,112 @@ def db_reset(  # noqa: B008
 @app.command("info")
 def db_info() -> None:
     """Show database stats."""
-    with get_backend() as backend:
-        version = backend.get_schema_version() or "unknown"
-        console.print(f"Schema version: {version}")
-        # Freshness block — only shown when the DB has been indexed from a git repo
+    # db info is a read-only command.  All Cypher reads route through the live
+    # server (run_read_routed) to avoid "Database is locked" while the MCP server
+    # holds the write lock.  get_schema_version / get_indexed_sha are inlined as
+    # run_read_routed calls using their known Cypher so they too route through the
+    # socket when a server is live; this avoids a direct-open that would hit the lock.
+    # Schema version
+    schema_rows = run_read_routed("MATCH (v:SchemaVersion) RETURN v.version AS version LIMIT 1", {})
+    version = (schema_rows[0]["version"] if schema_rows else None) or "unknown"
+    console.print(f"Schema version: {version}")
+    # Freshness block — only shown when the DB has been indexed from a git repo
+    try:
+        sha_rows = run_read_routed(
+            "MATCH (v:SchemaVersion) RETURN v.indexed_sha AS sha LIMIT 1", {}
+        )
+        indexed_sha = sha_rows[0]["sha"] if sha_rows else None
+        repo_rows = run_read_routed("MATCH (r:Repo) RETURN r.path AS path LIMIT 1", {})
+        if repo_rows and indexed_sha is not None and repo_rows[0].get("path"):
+            repo_root = Path(repo_rows[0]["path"])
+            f = compute_freshness(repo_root, indexed_sha)
+            console.print(render_freshness_line(f))
+    except NotImplementedError:
+        # Neo4j backend raises NotImplementedError for get_indexed_sha — skip silently
+        pass
+    except Exception as e:
+        # Any unexpected error in the freshness block must not crash db info
+        logger.debug(f"Freshness check skipped: {e}")
+    # Show node counts for all labels
+    for label in NodeLabel:
         try:
-            indexed_sha = backend.get_indexed_sha()
-            repo_rows = backend.run_read("MATCH (r:Repo) RETURN r.path AS path LIMIT 1", {})
-            if repo_rows and indexed_sha is not None and repo_rows[0].get("path"):
-                repo_root = Path(repo_rows[0]["path"])
-                f = compute_freshness(repo_root, indexed_sha)
-                console.print(render_freshness_line(f))
-        except NotImplementedError:
-            # Neo4j backend raises NotImplementedError for get_indexed_sha — skip silently
-            pass
+            result = run_read_routed(f"MATCH (n:{label}) RETURN COUNT(*) AS count", {})
+            count = result[0]["count"] if result else 0
+            console.print(f"  {label}: {count}")
         except Exception as e:
-            # Any unexpected error in the freshness block must not crash db info
-            logger.debug(f"Freshness check skipped: {e}")
-        # Show node counts for all labels
-        for label in NodeLabel:
-            try:
-                result = backend.run_read(f"MATCH (n:{label}) RETURN COUNT(*) AS count", {})
-                count = result[0]["count"] if result else 0
-                console.print(f"  {label}: {count}")
-            except Exception as e:
-                # Log unexpected exceptions instead of silently skipping
-                logger.error(f"Error getting count for {label}: {e}")
-                console.print(f"  [red]{label}: error[/red]")
-        # Health check section
-        repo_count_result = backend.run_read("MATCH (n:Repo) RETURN COUNT(n) AS count", {})
-        repo_count = repo_count_result[0]["count"] if repo_count_result else 0
-        if repo_count == 0:
-            console.print(  # noqa: E501
-                "[red]Database is empty. Run 'sqlcg db init' and 'sqlcg index <path>' first.[/red]"
+            # Log unexpected exceptions instead of silently skipping
+            logger.error(f"Error getting count for {label}: {e}")
+            console.print(f"  [red]{label}: error[/red]")
+    # Health check section
+    repo_count_result = run_read_routed("MATCH (n:Repo) RETURN COUNT(n) AS count", {})
+    repo_count = repo_count_result[0]["count"] if repo_count_result else 0
+    if repo_count == 0:
+        console.print(  # noqa: E501
+            "[red]Database is empty. Run 'sqlcg db init' and 'sqlcg index <path>' first.[/red]"
+        )
+    else:
+        query_count_result = run_read_routed("MATCH (n:SqlQuery) RETURN COUNT(n) AS count", {})
+        query_count = query_count_result[0]["count"] if query_count_result else 0
+        if query_count == 0:
+            console.print(
+                "[yellow]No queries indexed. Run 'sqlcg index <path>' to populate "
+                "the graph.[/yellow]"
             )
         else:
-            query_count_result = backend.run_read("MATCH (n:SqlQuery) RETURN COUNT(n) AS count", {})
-            query_count = query_count_result[0]["count"] if query_count_result else 0
+            col_count_result = run_read_routed("MATCH (n:SqlColumn) RETURN COUNT(n) AS count", {})
+            col_count = col_count_result[0]["count"] if col_count_result else 0
-            if query_count == 0:
+            if col_count == 0:
                 console.print(
-                    "[yellow]No queries indexed. Run 'sqlcg index <path>' to populate "
-                    "the graph.[/yellow]"
+                    "[yellow]Column lineage not available. Tools trace_column_lineage, "
+                    "get_downstream_dependencies, and get_upstream_dependencies "
+                    "will return empty results.[/yellow]"
                 )
-            else:
-                col_count_result = backend.run_read(
-                    "MATCH (n:SqlColumn) RETURN COUNT(n) AS count", {}
-                )
-                col_count = col_count_result[0]["count"] if col_count_result else 0
-                if col_count == 0:
-                    console.print(
-                        "[yellow]Column lineage not available. Tools trace_column_lineage, "
-                        "get_downstream_dependencies, and get_upstream_dependencies "
-                        "will return empty results.[/yellow]"
-                    )
-        # Print COLUMN_LINEAGE edges count
-        edges_result = backend.run_read(
-            "MATCH ()-[r:COLUMN_LINEAGE]->() RETURN COUNT(r) AS count", {}
-        )
-        edges_count = edges_result[0]["count"] if edges_result else 0
-        console.print(f"  COLUMN_LINEAGE edges: {edges_count}")
-        # Print star resolution metrics (T-07)
-        from sqlcg.core.queries import COUNT_STAR_EXPANSIONS_QUERY, COUNT_STAR_SOURCES_QUERY
+    # Print COLUMN_LINEAGE edges count
+    edges_result = run_read_routed("MATCH ()-[r:COLUMN_LINEAGE]->() RETURN COUNT(r) AS count", {})
+    edges_count = edges_result[0]["count"] if edges_result else 0
+    console.print(f"  COLUMN_LINEAGE edges: {edges_count}")
-        star_source_result = backend.run_read(COUNT_STAR_SOURCES_QUERY, {})
-        star_source_count = star_source_result[0]["n"] if star_source_result else 0
-        console.print(f"  STAR_SOURCE edges: {star_source_count}")
+    # Print star resolution metrics (T-07)
+    from sqlcg.core.queries import COUNT_STAR_EXPANSIONS_QUERY, COUNT_STAR_SOURCES_QUERY
-        star_expansion_result = backend.run_read(COUNT_STAR_EXPANSIONS_QUERY, {})
-        star_expansion_count = star_expansion_result[0]["n"] if star_expansion_result else 0
-        console.print(f"  STAR_EXPANSION lineage edges: {star_expansion_count}")
+    star_source_result = run_read_routed(COUNT_STAR_SOURCES_QUERY, {})
+    star_source_count = star_source_result[0]["n"] if star_source_result else 0
+    console.print(f"  STAR_SOURCE edges: {star_source_count}")
-        # Print parsing mode distribution
-        mode_query = (
-            "MATCH (q:SqlQuery) RETURN q.parsing_mode AS mode, COUNT(q) AS cnt ORDER BY cnt DESC"
-        )
-        mode_rows = backend.run_read(mode_query, {})
-        if mode_rows and "mode" in mode_rows[0]:
-            console.print("\n  Parsing mode distribution:")
-            for row in mode_rows:
-                console.print(f"    {row['mode']}: {row['cnt']}")
+    star_expansion_result = run_read_routed(COUNT_STAR_EXPANSIONS_QUERY, {})
+    star_expansion_count = star_expansion_result[0]["n"] if star_expansion_result else 0
+    console.print(f"  STAR_EXPANSION lineage edges: {star_expansion_count}")
+    # Print parsing mode distribution
+    mode_query = (
+        "MATCH (q:SqlQuery) RETURN q.parsing_mode AS mode, COUNT(q) AS cnt ORDER BY cnt DESC"
+    )
+    mode_rows = run_read_routed(mode_query, {})
+    if mode_rows and "mode" in mode_rows[0]:
+        console.print("\n  Parsing mode distribution:")
+        for row in mode_rows:
+            console.print(f"    {row['mode']}: {row['cnt']}")
 @app.command("list-repos")
 def list_repos() -> None:
     """List all indexed repositories."""
-    with get_backend() as backend:
-        result = backend.run_read("MATCH (r:Repo) RETURN r.path AS path, r.name AS name", {})
+    result = run_read_routed("MATCH (r:Repo) RETURN r.path AS path, r.name AS name", {})
-        if not result:
-            console.print("[yellow]No repositories indexed[/yellow]")
-        else:
-            from rich.table import Table
+    if not result:
+        console.print("[yellow]No repositories indexed[/yellow]")
+    else:
+        from rich.table import Table
-            table = Table("Path", "Name")
-            for row in result:
-                table.add_row(str(row.get("path", "")), str(row.get("name", "")))
-            console.print(table)
+        table = Table("Path", "Name")
+        for row in result:
+            table.add_row(str(row.get("path", "")), str(row.get("name", "")))
+        console.print(table)

sqlcg/cli/commands/find.py CHANGED Viewed

@@ -4,8 +4,8 @@ import typer
 from rich.console import Console
 from rich.table import Table
-from sqlcg.core.config import get_backend
 from sqlcg.core.schema import NodeLabel
+from sqlcg.server.read_client import run_read_routed
 app = typer.Typer(help="Search the graph")
 console = Console()
@@ -18,21 +18,20 @@ def find_table(  # noqa: B008
 ) -> None:
     """Find a table by name."""
     name = name.lower()  # graph keys are lowercased at index time (C2 normalization)
-    with get_backend() as backend:
-        results = backend.run_read(
-            f"MATCH (t:{NodeLabel.TABLE}) WHERE t.qualified CONTAINS $name "
-            "RETURN t.qualified AS qualified, t.kind AS kind LIMIT 50",
-            {"name": name},
-        )
-        if not raw:
-            from sqlcg.server.noise_filter import NoiseFilter
+    results = run_read_routed(
+        f"MATCH (t:{NodeLabel.TABLE}) WHERE t.qualified CONTAINS $name "
+        "RETURN t.qualified AS qualified, t.kind AS kind LIMIT 50",
+        {"name": name},
+    )
+    if not raw:
+        from sqlcg.server.noise_filter import NoiseFilter
-            nf = NoiseFilter.from_config()  # repo_root=None → falls back to Path.cwd()
-            ids = [r["qualified"] for r in results]
-            kept, _ = nf.filter_nodes(ids)
-            kept_set = set(kept)
-            results = [r for r in results if r["qualified"] in kept_set]
-        _print_table(results, ["qualified", "kind"])
+        nf = NoiseFilter.from_config()  # repo_root=None → falls back to Path.cwd()
+        ids = [r["qualified"] for r in results]
+        kept, _ = nf.filter_nodes(ids)
+        kept_set = set(kept)
+        results = [r for r in results if r["qualified"] in kept_set]
+    _print_table(results, ["qualified", "kind"])
 @app.command("column")
@@ -42,18 +41,17 @@ def find_column(  # noqa: B008
 ) -> None:
     """Find a column by table.column reference."""
     ref = ref.lower()  # graph keys are lowercased at index time (C2 normalization)
-    with get_backend() as backend:
-        results = backend.run_read(
-            f"MATCH (c:{NodeLabel.COLUMN}) WHERE c.id CONTAINS $ref RETURN c.id AS id LIMIT 50",
-            {"ref": ref},
-        )
-        if not raw:
-            from sqlcg.server.noise_filter import NoiseFilter
+    results = run_read_routed(
+        f"MATCH (c:{NodeLabel.COLUMN}) WHERE c.id CONTAINS $ref RETURN c.id AS id LIMIT 50",
+        {"ref": ref},
+    )
+    if not raw:
+        from sqlcg.server.noise_filter import NoiseFilter
-            nf = NoiseFilter.from_config()  # repo_root=None → falls back to Path.cwd()
-            # Filter on the schema.table portion of each column id (schema.table.column)
-            results = [r for r in results if not nf.is_noise(r["id"].rsplit(".", 1)[0])]
-        _print_table(results, ["id"])
+        nf = NoiseFilter.from_config()  # repo_root=None → falls back to Path.cwd()
+        # Filter on the schema.table portion of each column id (schema.table.column)
+        results = [r for r in results if not nf.is_noise(r["id"].rsplit(".", 1)[0])]
+    _print_table(results, ["id"])
 @app.command("pattern")
@@ -61,13 +59,12 @@ def find_pattern(  # noqa: B008
     pattern: str = typer.Argument(..., help="SQL pattern to search for"),  # noqa: B008
 ) -> None:
     """Find queries containing a SQL pattern."""
-    with get_backend() as backend:
-        results = backend.run_read(
-            f"MATCH (q:{NodeLabel.QUERY}) WHERE q.sql CONTAINS $pattern "
-            "RETURN q.id AS id, q.kind AS kind LIMIT 50",
-            {"pattern": pattern},
-        )
-        _print_table(results, ["id", "kind"])
+    results = run_read_routed(
+        f"MATCH (q:{NodeLabel.QUERY}) WHERE q.sql CONTAINS $pattern "
+        "RETURN q.id AS id, q.kind AS kind LIMIT 50",
+        {"pattern": pattern},
+    )
+    _print_table(results, ["id", "kind"])
 def _print_table(rows: list[dict], columns: list[str]) -> None:

sqlcg/cli/commands/gain.py CHANGED Viewed

@@ -7,8 +7,8 @@ from pathlib import Path
 import typer
 from rich.console import Console
-from sqlcg.core.config import get_backend
 from sqlcg.metrics import store as metrics_module
+from sqlcg.server.read_client import run_read_routed
 from sqlcg.utils.logging import getLogger
 logger = getLogger(__name__)
@@ -120,19 +120,21 @@ def gain_cmd(
         )
         execute_cypher_ratio = execute_cypher_count / total_calls if total_calls > 0 else 0
-        # Section F: parse quality from graph
+        # Section F: parse quality from graph.
+        # run_read_routed raises typer.Exit (Exception-derived, NOT SystemExit) on
+        # server-busy timeout, so the except-Exception block degrades gracefully
+        # (skips the parse-quality section) instead of crashing gain (WARNING 3).
         parse_quality: dict[str, int] | None = None
         try:
-            with get_backend() as backend:
-                mode_rows = backend.run_read(
-                    "MATCH (q:SqlQuery) RETURN q.parsing_mode AS mode,"
-                    " COUNT(q) AS cnt ORDER BY cnt DESC",
-                    {},
-                )
-                if mode_rows and "mode" in mode_rows[0]:
-                    parse_quality = {str(r["mode"]): int(r["cnt"]) for r in mode_rows}
+            mode_rows = run_read_routed(
+                "MATCH (q:SqlQuery) RETURN q.parsing_mode AS mode,"
+                " COUNT(q) AS cnt ORDER BY cnt DESC",
+                {},
+            )
+            if mode_rows and "mode" in mode_rows[0]:
+                parse_quality = {str(r["mode"]): int(r["cnt"]) for r in mode_rows}
         except Exception:
-            pass  # graph not available — skip quality section
+            pass  # graph not available or server busy — skip quality section
         if json_output:
             payload: dict = {

sqlcg/core/config.py CHANGED Viewed

@@ -346,14 +346,32 @@ def get_external_consumers(path: Path) -> list[ExternalConsumerSpec]:
     return []
-def get_backend() -> "GraphBackend":
+def get_backend(read_only: bool = False) -> "GraphBackend":
     """Get a graph backend instance respecting the SQLCG_BACKEND env var.
+    Args:
+        read_only: Open the database in read-only mode. For KuzuBackend this
+            enables multiple concurrent read-only opens (reader/reader
+            concurrency), but does NOT allow reads while a read-write writer
+            holds the exclusive process lock — that requires routing through the
+            live MCP server via ``read_client.run_read_routed`` (v1.2.0).
+            Ignored for Neo4jBackend (Neo4j has no single-writer process lock;
+            the flag is a no-op and the normal connection is opened).
+            All writer call sites (index, reindex, db init/reset, server
+            init_backend) use the default ``False``.
     Returns:
         A GraphBackend instance (KuzuBackend by default, or Neo4jBackend)
     Raises:
         ValueError: If backend type is not recognized
+    Note:
+        CLI read commands (find, analyze, db info, gain) route through a live
+        MCP server via ``read_client.run_read_routed`` (v1.2.0) when a server
+        is live, falling back to ``get_backend(read_only=True)`` when no server
+        is present. The fallback path still contends for the process lock under
+        an active writer (Windows / no-server fallback only).
     """
     backend_type = os.getenv("SQLCG_BACKEND", "kuzu")
@@ -361,14 +379,26 @@ def get_backend() -> "GraphBackend":
         from sqlcg.core.kuzu_backend import KuzuBackend
         kuzu_cfg = KuzuConfig.from_env()
-        return KuzuBackend(
-            str(kuzu_cfg.db_path),
-            buffer_pool_size_mb=kuzu_cfg.buffer_pool_size_mb,
-        )
+        try:
+            return KuzuBackend(
+                str(kuzu_cfg.db_path),
+                buffer_pool_size_mb=kuzu_cfg.buffer_pool_size_mb,
+                read_only=read_only,
+            )
+        except RuntimeError as exc:
+            if read_only and "READ ONLY" in str(exc):
+                # KùzuDB refuses to open a non-existent or empty DB in read-only
+                # mode ("Cannot create an empty database under READ ONLY mode").
+                # Surface the same empty-DB guidance the user sees from `db info`.
+                raise RuntimeError(
+                    "Database not initialised — run 'sqlcg db init' and 'sqlcg index <path>' first."
+                ) from exc
+            raise
     elif backend_type == "neo4j":
         from sqlcg.core.neo4j_backend import Neo4jBackend
         neo4j_cfg = Neo4jConfig.from_env()
+        # read_only is ignored for Neo4j — no single-writer process lock.
         return Neo4jBackend(neo4j_cfg.uri, neo4j_cfg.user, neo4j_cfg.password)
     else:
         raise ValueError(f"Unknown backend type: {backend_type}")

sqlcg/core/kuzu_backend.py CHANGED Viewed

@@ -58,7 +58,10 @@ class KuzuBackend(GraphBackend):
         Args:
             db_path: Path to the KùzuDB database file (or ':memory:' for in-memory)
             buffer_pool_size_mb: Buffer pool size in MB (0 = use KuzuDB default)
-            read_only: Open in read-only mode (allows concurrent indexing)
+            read_only: Open in read-only mode.  Enables concurrent read-only
+                opens (reader/reader concurrency) by not taking the exclusive
+                write lock.  Does NOT allow reads while a read-write writer
+                holds the lock — KùzuDB's exclusive lock is process-level.
         Raises:
             RuntimeError: If the database is locked or cannot be opened.

sqlcg/core/queries.cypher CHANGED Viewed

@@ -38,12 +38,6 @@ RETURN dst.id AS id, dst.col_name AS col_name, dst.table_qualified AS table_qual
 MATCH (dst:SqlColumn {id: $id})<-[:COLUMN_LINEAGE]-(src:SqlColumn)
 RETURN src.id AS id, src.col_name AS col_name, src.table_qualified AS table_qualified
--- GET_UPSTREAM_DEPENDENCIES_FILTERED
-MATCH (dst:SqlColumn {id: $id})<-[:COLUMN_LINEAGE]-(src:SqlColumn)
-MATCH (t:SqlTable {qualified: src.table_qualified})
-WHERE t.kind IN ['table', 'external']
-RETURN src.id AS id, src.col_name AS col_name, src.table_qualified AS table_qualified
 -- SEARCH_SQL_PATTERN
 MATCH (q:SqlQuery)-[:QUERY_DEFINED_IN]->(f:File)
 WHERE contains(q.sql, $query)

sqlcg/core/queries.py CHANGED Viewed

@@ -28,7 +28,6 @@ TRACE_COLUMN_LINEAGE_QUERY = _Q["TRACE_COLUMN_LINEAGE"]
 FIND_TABLE_USAGES_QUERY = _Q["FIND_TABLE_USAGES"]
 GET_DOWNSTREAM_DEPENDENCIES_QUERY = _Q["GET_DOWNSTREAM_DEPENDENCIES"]
 GET_UPSTREAM_DEPENDENCIES_QUERY = _Q["GET_UPSTREAM_DEPENDENCIES"]
-GET_UPSTREAM_DEPENDENCIES_FILTERED_QUERY = _Q["GET_UPSTREAM_DEPENDENCIES_FILTERED"]
 SEARCH_SQL_PATTERN_QUERY = _Q["SEARCH_SQL_PATTERN"]
 LIST_DIALECTS_AND_REPOS_QUERY = _Q["LIST_DIALECTS_AND_REPOS"]
 EXPAND_STAR_SOURCES_QUERY = _Q["EXPAND_STAR_SOURCES"]

sqlcg/indexer/indexer.py CHANGED Viewed

@@ -93,8 +93,11 @@ def _flush_row_batch(db: GraphBackend, buf: BatchRowBuffer) -> None:
     This is the v1.1.1 batch-flush core: called once per batch (not once per file).
     Dedup keys mirror the graph's MERGE cardinality:
       - file_rows:             path (primary key)
-      - table_rows:            qualified (primary key); prefers row with non-empty
-                               defined_in_file so DEFINED_IN provenance is preserved.
+      - table_rows:            qualified (primary key); prefers (1) row with non-empty
+                               defined_in_file so DEFINED_IN provenance is preserved;
+                               (2) structural kind ('cte','derived','external') over
+                               default 'table' so CTE aliases keep kind='cte' even when
+                               also seen as source references with the default kind.
       - column_rows:           id (primary key)
       - query_rows:            id (primary key, globally unique path:index)
       - edge rows:             (src_key, dst_key) only — matches MERGE (src)-[r]->(dst)
@@ -108,12 +111,27 @@ def _flush_row_batch(db: GraphBackend, buf: BatchRowBuffer) -> None:
     # --- Phase B: batch-scoped dedup ---
     # For table_rows, prefer defined rows (non-empty defined_in_file) so provenance
     # is not lost when a shared table is referenced by multiple files.
+    # Also prefer structurally-assigned kinds ('cte', 'derived', 'external') over the
+    # default 'table' kind: a CTE alias emitted first as a source reference (kind='table'
+    # default) and later confirmed as a CTE destination (kind='cte') must keep 'cte' so
+    # the kind filter correctly excludes it from default filtered output.
+    _structural_kinds = {"cte", "derived", "external"}
     table_dedup: dict[str, dict] = {}
     for r in buf.table_rows:
         key = r["qualified"]
         existing = table_dedup.get(key)
-        if existing is None or (not existing.get("defined_in_file") and r.get("defined_in_file")):
+        if existing is None:
             table_dedup[key] = r
+        else:
+            # Rule 1: prefer rows with defined_in_file (DDL provenance)
+            if not existing.get("defined_in_file") and r.get("defined_in_file"):
+                table_dedup[key] = r
+            # Rule 2: prefer structural kind over default 'table'
+            elif (
+                existing.get("kind", "table") not in _structural_kinds
+                and r.get("kind", "table") in _structural_kinds
+            ):
+                table_dedup[key] = r
     table_rows = list(table_dedup.values())
     column_rows = list({r["id"]: r for r in buf.column_rows}.values())
@@ -432,7 +450,14 @@ class Indexer:
             Note: sqlcg watch's reindex_file uses a separate code path with
             its own short per-file transaction. PERF-BATCH only affects index_repo.
             """
-            self._upsert_file_batch(batch, db, defined_table_registry, nonlocal_counts)
+            self._upsert_file_batch(
+                batch,
+                db,
+                defined_table_registry,
+                nonlocal_counts,
+                canonical_by_bare=aggregator.canonical_by_bare,
+                ambiguous_bare=aggregator._ambiguous_bare,
+            )
         if profile:
             _t_upsert_start = time.perf_counter()
@@ -975,6 +1000,8 @@ class Indexer:
         self,
         parsed: ParsedFile,
         defined_table_registry: dict[str, str] | None = None,
+        canonical_by_bare: dict[str, str] | None = None,
+        ambiguous_bare: set[str] | None = None,
     ) -> FileRowSet:
         """Build all row dicts for one ParsedFile — Phase A (pure, no db access).
@@ -986,6 +1013,13 @@ class Indexer:
         Args:
             parsed: ParsedFile to build rows for
             defined_table_registry: Optional cross-file DDL dedup registry
+            canonical_by_bare: Optional #44 bare-name → canonical full_id index
+                (populated by CrossFileAggregator.register_pass1 from DDL tables).
+                When provided, unqualified INSERT targets whose bare name maps to
+                exactly one DDL canonical are rewritten to use the canonical full_id.
+                When None, the rewrite is skipped (single-file / reindex_file path).
+            ambiguous_bare: Optional set of bare names defined in >1 schema.
+                These are never rewritten — the existing _bare_ref CLI hint handles them.
         Returns:
             FileRowSet with all row lists and per-file counts/quality key
@@ -1126,6 +1160,24 @@ class Indexer:
                         "table_name": edge.src.table.name,
                     }
                 )
+                # Half A (#39): emit a SqlTable node for the source table.
+                # CTE-body-only sources are not in stmt.sources (which only covers
+                # tables reachable via the parser's top-level FROM list), so they were
+                # previously missing from the graph.  edge.src.table is a frozen
+                # TableRef with schema-aliasing already applied at parse time — the
+                # qualified value is guaranteed to match edge.src.table_qualified.
+                # key set is identical to other table_rows entries → upsert_nodes_bulk
+                # homogeneity preserved; MERGE on primary key deduplicates re-emits.
+                rows.table_rows.append(
+                    {
+                        "qualified": edge.src.table.full_id,
+                        "name": edge.src.table.name,
+                        "catalog": edge.src.table.catalog or "",
+                        "db": edge.src.table.db or "",
+                        "kind": edge.src.table.role,
+                        "defined_in_file": "",
+                    }
+                )
                 rows.column_rows.append(
                     {
                         "id": dst_id,
@@ -1183,15 +1235,53 @@ class Indexer:
                 rows.counts["star_sources"] += 1
             # Upsert target table node (if not already a defined_table)
-            # so that star expansion can create destination columns
+            # so that star expansion can create destination columns.
+            # #44: when a canonical_by_bare index is available, attempt to resolve
+            # an unqualified / wrong-schema INSERT target to its DDL-canonical full_id
+            # so that INSERT-target nodes share identity with the DDL node.
             if stmt.target and stmt.target.full_id not in defined_table_ids:
+                target_qualified = stmt.target.full_id
+                target_name = stmt.target.name
+                target_db = stmt.target.db or ""
+                target_catalog = stmt.target.catalog or ""
+                target_kind = "table"
+                # #44 canonical-name resolution: if bare name maps unambiguously to a
+                # DDL-defined table, use the canonical full_id for the emitted row.
+                # Degrading to no-op when canonical_by_bare is None (single-file path).
+                if canonical_by_bare is not None:
+                    bare = (stmt.target.name or "").lower()
+                    if bare and bare not in (ambiguous_bare or set()) and bare in canonical_by_bare:
+                        # Resolve to the sole DDL canonical — keeps kind='table'
+                        canonical_id = canonical_by_bare[bare]
+                        target_qualified = canonical_id
+                        # Derive name/db/catalog from the canonical full_id parts.
+                        # full_id format: "db.name" or "catalog.db.name" or "name"
+                        parts = canonical_id.split(".")
+                        if len(parts) >= 3:
+                            target_name = parts[-1]
+                            target_db = parts[-2]
+                            target_catalog = parts[-3]
+                        elif len(parts) == 2:
+                            target_name = parts[-1]
+                            target_db = parts[-2]
+                            target_catalog = ""
+                        else:
+                            target_name = canonical_id
+                            target_db = ""
+                            target_catalog = ""
+                    else:
+                        # Not resolved to a DDL canonical — mark as derived so the
+                        # kind filter excludes it from default (non-raw) output (#45.2)
+                        target_kind = "derived"
                 rows.table_rows.append(
                     {
-                        "qualified": stmt.target.full_id,
-                        "name": stmt.target.name,
-                        "catalog": stmt.target.catalog or "",
-                        "db": stmt.target.db or "",
-                        "kind": "table",
+                        "qualified": target_qualified,
+                        "name": target_name,
+                        "catalog": target_catalog,
+                        "db": target_db,
+                        "kind": target_kind,
                         "defined_in_file": "",
                     }
                 )
@@ -1205,6 +1295,8 @@ class Indexer:
         defined_table_registry: dict[str, str],
         nonlocal_counts: dict,
         warning_prefix: str = "",
+        canonical_by_bare: dict[str, str] | None = None,
+        ambiguous_bare: set[str] | None = None,
     ) -> None:
         """Accumulate rows for all files in batch, then flush once in one transaction.
@@ -1219,13 +1311,19 @@ class Indexer:
             defined_table_registry: Cross-file DDL dedup registry
             nonlocal_counts: Mutable summary dict updated in place (tables/edges/quality/…)
             warning_prefix: Optional prefix for warning log messages (e.g. "resync_changed: ")
+            canonical_by_bare: Optional #44 bare-name → canonical full_id index.
+                When provided, unqualified INSERT targets are rewritten to their DDL
+                canonical full_id.  None on single-file / reindex_file paths.
+            ambiguous_bare: Optional set of bare names defined in >1 schema.
         """
         if not batch:
             return
         buf = BatchRowBuffer()
         for parsed_in_batch in batch:
             try:
-                file_rows = self._build_file_rows(parsed_in_batch, defined_table_registry)
+                file_rows = self._build_file_rows(
+                    parsed_in_batch, defined_table_registry, canonical_by_bare, ambiguous_bare
+                )
             except Exception as exc:
                 logger.warning(
                     "%sFailed to build rows for %s: %s — skipping",

sql-code-graph 1.1.0__py3-none-any.whl → 1.2.2__py3-none-any.whl

sql-code-graph 1.1.0__py3-none-any.whl → 1.2.2__py3-none-any.whl