sql-code-graph 1.1.0__py3-none-any.whl → 1.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sqlcg/cli/commands/db.py CHANGED
@@ -10,6 +10,7 @@ from rich.console import Console
10
10
  from sqlcg.core.config import get_backend, get_db_path
11
11
  from sqlcg.core.freshness import compute_freshness, render_freshness_line
12
12
  from sqlcg.core.schema import NodeLabel
13
+ from sqlcg.server.read_client import run_read_routed
13
14
  from sqlcg.utils.logging import getLogger
14
15
 
15
16
  logger = getLogger(__name__)
@@ -75,107 +76,112 @@ def db_reset( # noqa: B008
75
76
  @app.command("info")
76
77
  def db_info() -> None:
77
78
  """Show database stats."""
78
- with get_backend() as backend:
79
- version = backend.get_schema_version() or "unknown"
80
- console.print(f"Schema version: {version}")
81
-
82
- # Freshness block only shown when the DB has been indexed from a git repo
79
+ # db info is a read-only command. All Cypher reads route through the live
80
+ # server (run_read_routed) to avoid "Database is locked" while the MCP server
81
+ # holds the write lock. get_schema_version / get_indexed_sha are inlined as
82
+ # run_read_routed calls using their known Cypher so they too route through the
83
+ # socket when a server is live; this avoids a direct-open that would hit the lock.
84
+
85
+ # Schema version
86
+ schema_rows = run_read_routed("MATCH (v:SchemaVersion) RETURN v.version AS version LIMIT 1", {})
87
+ version = (schema_rows[0]["version"] if schema_rows else None) or "unknown"
88
+ console.print(f"Schema version: {version}")
89
+
90
+ # Freshness block — only shown when the DB has been indexed from a git repo
91
+ try:
92
+ sha_rows = run_read_routed(
93
+ "MATCH (v:SchemaVersion) RETURN v.indexed_sha AS sha LIMIT 1", {}
94
+ )
95
+ indexed_sha = sha_rows[0]["sha"] if sha_rows else None
96
+ repo_rows = run_read_routed("MATCH (r:Repo) RETURN r.path AS path LIMIT 1", {})
97
+ if repo_rows and indexed_sha is not None and repo_rows[0].get("path"):
98
+ repo_root = Path(repo_rows[0]["path"])
99
+ f = compute_freshness(repo_root, indexed_sha)
100
+ console.print(render_freshness_line(f))
101
+ except NotImplementedError:
102
+ # Guard kept from the old backend.get_indexed_sha() call (Neo4j raised NotImplementedError); the SHA is now read via run_read_routed, so this may be vestigial — skip silently
103
+ pass
104
+ except Exception as e:
105
+ # Any unexpected error in the freshness block must not crash db info
106
+ logger.debug(f"Freshness check skipped: {e}")
107
+
108
+ # Show node counts for all labels
109
+ for label in NodeLabel:
83
110
  try:
84
- indexed_sha = backend.get_indexed_sha()
85
- repo_rows = backend.run_read("MATCH (r:Repo) RETURN r.path AS path LIMIT 1", {})
86
- if repo_rows and indexed_sha is not None and repo_rows[0].get("path"):
87
- repo_root = Path(repo_rows[0]["path"])
88
- f = compute_freshness(repo_root, indexed_sha)
89
- console.print(render_freshness_line(f))
90
- except NotImplementedError:
91
- # Neo4j backend raises NotImplementedError for get_indexed_sha — skip silently
92
- pass
111
+ result = run_read_routed(f"MATCH (n:{label}) RETURN COUNT(*) AS count", {})
112
+ count = result[0]["count"] if result else 0
113
+ console.print(f" {label}: {count}")
93
114
  except Exception as e:
94
- # Any unexpected error in the freshness block must not crash db info
95
- logger.debug(f"Freshness check skipped: {e}")
96
-
97
- # Show node counts for all labels
98
- for label in NodeLabel:
99
- try:
100
- result = backend.run_read(f"MATCH (n:{label}) RETURN COUNT(*) AS count", {})
101
- count = result[0]["count"] if result else 0
102
- console.print(f" {label}: {count}")
103
- except Exception as e:
104
- # Log unexpected exceptions instead of silently skipping
105
- logger.error(f"Error getting count for {label}: {e}")
106
- console.print(f" [red]{label}: error[/red]")
107
-
108
- # Health check section
109
- repo_count_result = backend.run_read("MATCH (n:Repo) RETURN COUNT(n) AS count", {})
110
- repo_count = repo_count_result[0]["count"] if repo_count_result else 0
111
-
112
- if repo_count == 0:
113
- console.print( # noqa: E501
114
- "[red]Database is empty. Run 'sqlcg db init' and 'sqlcg index <path>' first.[/red]"
115
+ # Log unexpected exceptions instead of silently skipping
116
+ logger.error(f"Error getting count for {label}: {e}")
117
+ console.print(f" [red]{label}: error[/red]")
118
+
119
+ # Health check section
120
+ repo_count_result = run_read_routed("MATCH (n:Repo) RETURN COUNT(n) AS count", {})
121
+ repo_count = repo_count_result[0]["count"] if repo_count_result else 0
122
+
123
+ if repo_count == 0:
124
+ console.print( # noqa: E501
125
+ "[red]Database is empty. Run 'sqlcg db init' and 'sqlcg index <path>' first.[/red]"
126
+ )
127
+ else:
128
+ query_count_result = run_read_routed("MATCH (n:SqlQuery) RETURN COUNT(n) AS count", {})
129
+ query_count = query_count_result[0]["count"] if query_count_result else 0
130
+
131
+ if query_count == 0:
132
+ console.print(
133
+ "[yellow]No queries indexed. Run 'sqlcg index <path>' to populate "
134
+ "the graph.[/yellow]"
115
135
  )
116
136
  else:
117
- query_count_result = backend.run_read("MATCH (n:SqlQuery) RETURN COUNT(n) AS count", {})
118
- query_count = query_count_result[0]["count"] if query_count_result else 0
137
+ col_count_result = run_read_routed("MATCH (n:SqlColumn) RETURN COUNT(n) AS count", {})
138
+ col_count = col_count_result[0]["count"] if col_count_result else 0
119
139
 
120
- if query_count == 0:
140
+ if col_count == 0:
121
141
  console.print(
122
- "[yellow]No queries indexed. Run 'sqlcg index <path>' to populate "
123
- "the graph.[/yellow]"
142
+ "[yellow]Column lineage not available. Tools trace_column_lineage, "
143
+ "get_downstream_dependencies, and get_upstream_dependencies "
144
+ "will return empty results.[/yellow]"
124
145
  )
125
- else:
126
- col_count_result = backend.run_read(
127
- "MATCH (n:SqlColumn) RETURN COUNT(n) AS count", {}
128
- )
129
- col_count = col_count_result[0]["count"] if col_count_result else 0
130
-
131
- if col_count == 0:
132
- console.print(
133
- "[yellow]Column lineage not available. Tools trace_column_lineage, "
134
- "get_downstream_dependencies, and get_upstream_dependencies "
135
- "will return empty results.[/yellow]"
136
- )
137
-
138
- # Print COLUMN_LINEAGE edges count
139
- edges_result = backend.run_read(
140
- "MATCH ()-[r:COLUMN_LINEAGE]->() RETURN COUNT(r) AS count", {}
141
- )
142
- edges_count = edges_result[0]["count"] if edges_result else 0
143
- console.print(f" COLUMN_LINEAGE edges: {edges_count}")
144
146
 
145
- # Print star resolution metrics (T-07)
146
- from sqlcg.core.queries import COUNT_STAR_EXPANSIONS_QUERY, COUNT_STAR_SOURCES_QUERY
147
+ # Print COLUMN_LINEAGE edges count
148
+ edges_result = run_read_routed("MATCH ()-[r:COLUMN_LINEAGE]->() RETURN COUNT(r) AS count", {})
149
+ edges_count = edges_result[0]["count"] if edges_result else 0
150
+ console.print(f" COLUMN_LINEAGE edges: {edges_count}")
147
151
 
148
- star_source_result = backend.run_read(COUNT_STAR_SOURCES_QUERY, {})
149
- star_source_count = star_source_result[0]["n"] if star_source_result else 0
150
- console.print(f" STAR_SOURCE edges: {star_source_count}")
152
+ # Print star resolution metrics (T-07)
153
+ from sqlcg.core.queries import COUNT_STAR_EXPANSIONS_QUERY, COUNT_STAR_SOURCES_QUERY
151
154
 
152
- star_expansion_result = backend.run_read(COUNT_STAR_EXPANSIONS_QUERY, {})
153
- star_expansion_count = star_expansion_result[0]["n"] if star_expansion_result else 0
154
- console.print(f" STAR_EXPANSION lineage edges: {star_expansion_count}")
155
+ star_source_result = run_read_routed(COUNT_STAR_SOURCES_QUERY, {})
156
+ star_source_count = star_source_result[0]["n"] if star_source_result else 0
157
+ console.print(f" STAR_SOURCE edges: {star_source_count}")
155
158
 
156
- # Print parsing mode distribution
157
- mode_query = (
158
- "MATCH (q:SqlQuery) RETURN q.parsing_mode AS mode, COUNT(q) AS cnt ORDER BY cnt DESC"
159
- )
160
- mode_rows = backend.run_read(mode_query, {})
161
- if mode_rows and "mode" in mode_rows[0]:
162
- console.print("\n Parsing mode distribution:")
163
- for row in mode_rows:
164
- console.print(f" {row['mode']}: {row['cnt']}")
159
+ star_expansion_result = run_read_routed(COUNT_STAR_EXPANSIONS_QUERY, {})
160
+ star_expansion_count = star_expansion_result[0]["n"] if star_expansion_result else 0
161
+ console.print(f" STAR_EXPANSION lineage edges: {star_expansion_count}")
162
+
163
+ # Print parsing mode distribution
164
+ mode_query = (
165
+ "MATCH (q:SqlQuery) RETURN q.parsing_mode AS mode, COUNT(q) AS cnt ORDER BY cnt DESC"
166
+ )
167
+ mode_rows = run_read_routed(mode_query, {})
168
+ if mode_rows and "mode" in mode_rows[0]:
169
+ console.print("\n Parsing mode distribution:")
170
+ for row in mode_rows:
171
+ console.print(f" {row['mode']}: {row['cnt']}")
165
172
 
166
173
 
167
174
  @app.command("list-repos")
168
175
  def list_repos() -> None:
169
176
  """List all indexed repositories."""
170
- with get_backend() as backend:
171
- result = backend.run_read("MATCH (r:Repo) RETURN r.path AS path, r.name AS name", {})
177
+ result = run_read_routed("MATCH (r:Repo) RETURN r.path AS path, r.name AS name", {})
172
178
 
173
- if not result:
174
- console.print("[yellow]No repositories indexed[/yellow]")
175
- else:
176
- from rich.table import Table
179
+ if not result:
180
+ console.print("[yellow]No repositories indexed[/yellow]")
181
+ else:
182
+ from rich.table import Table
177
183
 
178
- table = Table("Path", "Name")
179
- for row in result:
180
- table.add_row(str(row.get("path", "")), str(row.get("name", "")))
181
- console.print(table)
184
+ table = Table("Path", "Name")
185
+ for row in result:
186
+ table.add_row(str(row.get("path", "")), str(row.get("name", "")))
187
+ console.print(table)
@@ -4,8 +4,8 @@ import typer
4
4
  from rich.console import Console
5
5
  from rich.table import Table
6
6
 
7
- from sqlcg.core.config import get_backend
8
7
  from sqlcg.core.schema import NodeLabel
8
+ from sqlcg.server.read_client import run_read_routed
9
9
 
10
10
  app = typer.Typer(help="Search the graph")
11
11
  console = Console()
@@ -18,21 +18,20 @@ def find_table( # noqa: B008
18
18
  ) -> None:
19
19
  """Find a table by name."""
20
20
  name = name.lower() # graph keys are lowercased at index time (C2 normalization)
21
- with get_backend() as backend:
22
- results = backend.run_read(
23
- f"MATCH (t:{NodeLabel.TABLE}) WHERE t.qualified CONTAINS $name "
24
- "RETURN t.qualified AS qualified, t.kind AS kind LIMIT 50",
25
- {"name": name},
26
- )
27
- if not raw:
28
- from sqlcg.server.noise_filter import NoiseFilter
21
+ results = run_read_routed(
22
+ f"MATCH (t:{NodeLabel.TABLE}) WHERE t.qualified CONTAINS $name "
23
+ "RETURN t.qualified AS qualified, t.kind AS kind LIMIT 50",
24
+ {"name": name},
25
+ )
26
+ if not raw:
27
+ from sqlcg.server.noise_filter import NoiseFilter
29
28
 
30
- nf = NoiseFilter.from_config() # repo_root=None → falls back to Path.cwd()
31
- ids = [r["qualified"] for r in results]
32
- kept, _ = nf.filter_nodes(ids)
33
- kept_set = set(kept)
34
- results = [r for r in results if r["qualified"] in kept_set]
35
- _print_table(results, ["qualified", "kind"])
29
+ nf = NoiseFilter.from_config() # repo_root=None → falls back to Path.cwd()
30
+ ids = [r["qualified"] for r in results]
31
+ kept, _ = nf.filter_nodes(ids)
32
+ kept_set = set(kept)
33
+ results = [r for r in results if r["qualified"] in kept_set]
34
+ _print_table(results, ["qualified", "kind"])
36
35
 
37
36
 
38
37
  @app.command("column")
@@ -42,18 +41,17 @@ def find_column( # noqa: B008
42
41
  ) -> None:
43
42
  """Find a column by table.column reference."""
44
43
  ref = ref.lower() # graph keys are lowercased at index time (C2 normalization)
45
- with get_backend() as backend:
46
- results = backend.run_read(
47
- f"MATCH (c:{NodeLabel.COLUMN}) WHERE c.id CONTAINS $ref RETURN c.id AS id LIMIT 50",
48
- {"ref": ref},
49
- )
50
- if not raw:
51
- from sqlcg.server.noise_filter import NoiseFilter
44
+ results = run_read_routed(
45
+ f"MATCH (c:{NodeLabel.COLUMN}) WHERE c.id CONTAINS $ref RETURN c.id AS id LIMIT 50",
46
+ {"ref": ref},
47
+ )
48
+ if not raw:
49
+ from sqlcg.server.noise_filter import NoiseFilter
52
50
 
53
- nf = NoiseFilter.from_config() # repo_root=None → falls back to Path.cwd()
54
- # Filter on the schema.table portion of each column id (schema.table.column)
55
- results = [r for r in results if not nf.is_noise(r["id"].rsplit(".", 1)[0])]
56
- _print_table(results, ["id"])
51
+ nf = NoiseFilter.from_config() # repo_root=None → falls back to Path.cwd()
52
+ # Filter on the schema.table portion of each column id (schema.table.column)
53
+ results = [r for r in results if not nf.is_noise(r["id"].rsplit(".", 1)[0])]
54
+ _print_table(results, ["id"])
57
55
 
58
56
 
59
57
  @app.command("pattern")
@@ -61,13 +59,12 @@ def find_pattern( # noqa: B008
61
59
  pattern: str = typer.Argument(..., help="SQL pattern to search for"), # noqa: B008
62
60
  ) -> None:
63
61
  """Find queries containing a SQL pattern."""
64
- with get_backend() as backend:
65
- results = backend.run_read(
66
- f"MATCH (q:{NodeLabel.QUERY}) WHERE q.sql CONTAINS $pattern "
67
- "RETURN q.id AS id, q.kind AS kind LIMIT 50",
68
- {"pattern": pattern},
69
- )
70
- _print_table(results, ["id", "kind"])
62
+ results = run_read_routed(
63
+ f"MATCH (q:{NodeLabel.QUERY}) WHERE q.sql CONTAINS $pattern "
64
+ "RETURN q.id AS id, q.kind AS kind LIMIT 50",
65
+ {"pattern": pattern},
66
+ )
67
+ _print_table(results, ["id", "kind"])
71
68
 
72
69
 
73
70
  def _print_table(rows: list[dict], columns: list[str]) -> None:
@@ -7,8 +7,8 @@ from pathlib import Path
7
7
  import typer
8
8
  from rich.console import Console
9
9
 
10
- from sqlcg.core.config import get_backend
11
10
  from sqlcg.metrics import store as metrics_module
11
+ from sqlcg.server.read_client import run_read_routed
12
12
  from sqlcg.utils.logging import getLogger
13
13
 
14
14
  logger = getLogger(__name__)
@@ -120,19 +120,21 @@ def gain_cmd(
120
120
  )
121
121
  execute_cypher_ratio = execute_cypher_count / total_calls if total_calls > 0 else 0
122
122
 
123
- # Section F: parse quality from graph
123
+ # Section F: parse quality from graph.
124
+ # run_read_routed raises typer.Exit (Exception-derived, NOT SystemExit) on
125
+ # server-busy timeout, so the except-Exception block degrades gracefully
126
+ # (skips the parse-quality section) instead of crashing gain (WARNING 3).
124
127
  parse_quality: dict[str, int] | None = None
125
128
  try:
126
- with get_backend() as backend:
127
- mode_rows = backend.run_read(
128
- "MATCH (q:SqlQuery) RETURN q.parsing_mode AS mode,"
129
- " COUNT(q) AS cnt ORDER BY cnt DESC",
130
- {},
131
- )
132
- if mode_rows and "mode" in mode_rows[0]:
133
- parse_quality = {str(r["mode"]): int(r["cnt"]) for r in mode_rows}
129
+ mode_rows = run_read_routed(
130
+ "MATCH (q:SqlQuery) RETURN q.parsing_mode AS mode,"
131
+ " COUNT(q) AS cnt ORDER BY cnt DESC",
132
+ {},
133
+ )
134
+ if mode_rows and "mode" in mode_rows[0]:
135
+ parse_quality = {str(r["mode"]): int(r["cnt"]) for r in mode_rows}
134
136
  except Exception:
135
- pass # graph not available — skip quality section
137
+ pass # graph not available or server busy — skip quality section
136
138
 
137
139
  if json_output:
138
140
  payload: dict = {
sqlcg/core/config.py CHANGED
@@ -346,14 +346,32 @@ def get_external_consumers(path: Path) -> list[ExternalConsumerSpec]:
346
346
  return []
347
347
 
348
348
 
349
- def get_backend() -> "GraphBackend":
349
+ def get_backend(read_only: bool = False) -> "GraphBackend":
350
350
  """Get a graph backend instance respecting the SQLCG_BACKEND env var.
351
351
 
352
+ Args:
353
+ read_only: Open the database in read-only mode. For KuzuBackend this
354
+ enables multiple concurrent read-only opens (reader/reader
355
+ concurrency), but does NOT allow reads while a read-write writer
356
+ holds the exclusive process lock — that requires routing through the
357
+ live MCP server via ``read_client.run_read_routed`` (v1.2.0).
358
+ Ignored for Neo4jBackend (Neo4j has no single-writer process lock;
359
+ the flag is a no-op and the normal connection is opened).
360
+ All writer call sites (index, reindex, db init/reset, server
361
+ init_backend) use the default ``False``.
362
+
352
363
  Returns:
353
364
  A GraphBackend instance (KuzuBackend by default, or Neo4jBackend)
354
365
 
355
366
  Raises:
356
367
  ValueError: If backend type is not recognized
368
+
369
+ Note:
370
+ CLI read commands (find, analyze, db info, gain) route through the
371
+ MCP server via ``read_client.run_read_routed`` (v1.2.0) when a server
372
+ is live, falling back to ``get_backend(read_only=True)`` when no server
373
+ is present. The fallback path still contends for the process lock under
374
+ an active writer (Windows / no-server fallback only).
357
375
  """
358
376
  backend_type = os.getenv("SQLCG_BACKEND", "kuzu")
359
377
 
@@ -361,14 +379,26 @@ def get_backend() -> "GraphBackend":
361
379
  from sqlcg.core.kuzu_backend import KuzuBackend
362
380
 
363
381
  kuzu_cfg = KuzuConfig.from_env()
364
- return KuzuBackend(
365
- str(kuzu_cfg.db_path),
366
- buffer_pool_size_mb=kuzu_cfg.buffer_pool_size_mb,
367
- )
382
+ try:
383
+ return KuzuBackend(
384
+ str(kuzu_cfg.db_path),
385
+ buffer_pool_size_mb=kuzu_cfg.buffer_pool_size_mb,
386
+ read_only=read_only,
387
+ )
388
+ except RuntimeError as exc:
389
+ if read_only and "READ ONLY" in str(exc):
390
+ # KùzuDB refuses to open a non-existent or empty DB in read-only
391
+ # mode ("Cannot create an empty database under READ ONLY mode").
392
+ # Surface the same empty-DB guidance the user sees from `db info`.
393
+ raise RuntimeError(
394
+ "Database not initialised — run 'sqlcg db init' and 'sqlcg index <path>' first."
395
+ ) from exc
396
+ raise
368
397
  elif backend_type == "neo4j":
369
398
  from sqlcg.core.neo4j_backend import Neo4jBackend
370
399
 
371
400
  neo4j_cfg = Neo4jConfig.from_env()
401
+ # read_only is ignored for Neo4j — no single-writer process lock.
372
402
  return Neo4jBackend(neo4j_cfg.uri, neo4j_cfg.user, neo4j_cfg.password)
373
403
  else:
374
404
  raise ValueError(f"Unknown backend type: {backend_type}")
@@ -58,7 +58,10 @@ class KuzuBackend(GraphBackend):
58
58
  Args:
59
59
  db_path: Path to the KùzuDB database file (or ':memory:' for in-memory)
60
60
  buffer_pool_size_mb: Buffer pool size in MB (0 = use KuzuDB default)
61
- read_only: Open in read-only mode (allows concurrent indexing)
61
+ read_only: Open in read-only mode. Enables concurrent read-only
62
+ opens (reader/reader concurrency) by not taking the exclusive
63
+ write lock. Does NOT allow reads while a read-write writer
64
+ holds the lock — KùzuDB's exclusive lock is process-level.
62
65
 
63
66
  Raises:
64
67
  RuntimeError: If the database is locked or cannot be opened.
sqlcg/core/queries.cypher CHANGED
@@ -38,12 +38,6 @@ RETURN dst.id AS id, dst.col_name AS col_name, dst.table_qualified AS table_qual
38
38
  MATCH (dst:SqlColumn {id: $id})<-[:COLUMN_LINEAGE]-(src:SqlColumn)
39
39
  RETURN src.id AS id, src.col_name AS col_name, src.table_qualified AS table_qualified
40
40
 
41
- -- GET_UPSTREAM_DEPENDENCIES_FILTERED
42
- MATCH (dst:SqlColumn {id: $id})<-[:COLUMN_LINEAGE]-(src:SqlColumn)
43
- MATCH (t:SqlTable {qualified: src.table_qualified})
44
- WHERE t.kind IN ['table', 'external']
45
- RETURN src.id AS id, src.col_name AS col_name, src.table_qualified AS table_qualified
46
-
47
41
  -- SEARCH_SQL_PATTERN
48
42
  MATCH (q:SqlQuery)-[:QUERY_DEFINED_IN]->(f:File)
49
43
  WHERE contains(q.sql, $query)
sqlcg/core/queries.py CHANGED
@@ -28,7 +28,6 @@ TRACE_COLUMN_LINEAGE_QUERY = _Q["TRACE_COLUMN_LINEAGE"]
28
28
  FIND_TABLE_USAGES_QUERY = _Q["FIND_TABLE_USAGES"]
29
29
  GET_DOWNSTREAM_DEPENDENCIES_QUERY = _Q["GET_DOWNSTREAM_DEPENDENCIES"]
30
30
  GET_UPSTREAM_DEPENDENCIES_QUERY = _Q["GET_UPSTREAM_DEPENDENCIES"]
31
- GET_UPSTREAM_DEPENDENCIES_FILTERED_QUERY = _Q["GET_UPSTREAM_DEPENDENCIES_FILTERED"]
32
31
  SEARCH_SQL_PATTERN_QUERY = _Q["SEARCH_SQL_PATTERN"]
33
32
  LIST_DIALECTS_AND_REPOS_QUERY = _Q["LIST_DIALECTS_AND_REPOS"]
34
33
  EXPAND_STAR_SOURCES_QUERY = _Q["EXPAND_STAR_SOURCES"]
sqlcg/indexer/indexer.py CHANGED
@@ -93,8 +93,11 @@ def _flush_row_batch(db: GraphBackend, buf: BatchRowBuffer) -> None:
93
93
  This is the v1.1.1 batch-flush core: called once per batch (not once per file).
94
94
  Dedup keys mirror the graph's MERGE cardinality:
95
95
  - file_rows: path (primary key)
96
- - table_rows: qualified (primary key); prefers row with non-empty
97
- defined_in_file so DEFINED_IN provenance is preserved.
96
+ - table_rows: qualified (primary key); prefers (1) row with non-empty
97
+ defined_in_file so DEFINED_IN provenance is preserved;
98
+ (2) structural kind ('cte','derived','external') over
99
+ default 'table' so CTE aliases keep kind='cte' even when
100
+ also seen as source references with the default kind.
98
101
  - column_rows: id (primary key)
99
102
  - query_rows: id (primary key, globally unique path:index)
100
103
  - edge rows: (src_key, dst_key) only — matches MERGE (src)-[r]->(dst)
@@ -108,12 +111,27 @@ def _flush_row_batch(db: GraphBackend, buf: BatchRowBuffer) -> None:
108
111
  # --- Phase B: batch-scoped dedup ---
109
112
  # For table_rows, prefer defined rows (non-empty defined_in_file) so provenance
110
113
  # is not lost when a shared table is referenced by multiple files.
114
+ # Also prefer structurally-assigned kinds ('cte', 'derived', 'external') over the
115
+ # default 'table' kind: a CTE alias emitted first as a source reference (kind='table'
116
+ # default) and later confirmed as a CTE destination (kind='cte') must keep 'cte' so
117
+ # the kind filter correctly excludes it from default filtered output.
118
+ _structural_kinds = {"cte", "derived", "external"}
111
119
  table_dedup: dict[str, dict] = {}
112
120
  for r in buf.table_rows:
113
121
  key = r["qualified"]
114
122
  existing = table_dedup.get(key)
115
- if existing is None or (not existing.get("defined_in_file") and r.get("defined_in_file")):
123
+ if existing is None:
116
124
  table_dedup[key] = r
125
+ else:
126
+ # Rule 1: prefer rows with defined_in_file (DDL provenance)
127
+ if not existing.get("defined_in_file") and r.get("defined_in_file"):
128
+ table_dedup[key] = r
129
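+ # Rule 2: prefer structural kind over default 'table'. NOTE(review): this branch also fires when `existing` has defined_in_file and `r` does not, replacing the row and dropping provenance — confirm this is intended.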
130
+ elif (
131
+ existing.get("kind", "table") not in _structural_kinds
132
+ and r.get("kind", "table") in _structural_kinds
133
+ ):
134
+ table_dedup[key] = r
117
135
  table_rows = list(table_dedup.values())
118
136
 
119
137
  column_rows = list({r["id"]: r for r in buf.column_rows}.values())
@@ -432,7 +450,14 @@ class Indexer:
432
450
  Note: sqlcg watch's reindex_file uses a separate code path with
433
451
  its own short per-file transaction. PERF-BATCH only affects index_repo.
434
452
  """
435
- self._upsert_file_batch(batch, db, defined_table_registry, nonlocal_counts)
453
+ self._upsert_file_batch(
454
+ batch,
455
+ db,
456
+ defined_table_registry,
457
+ nonlocal_counts,
458
+ canonical_by_bare=aggregator.canonical_by_bare,
459
+ ambiguous_bare=aggregator._ambiguous_bare,
460
+ )
436
461
 
437
462
  if profile:
438
463
  _t_upsert_start = time.perf_counter()
@@ -975,6 +1000,8 @@ class Indexer:
975
1000
  self,
976
1001
  parsed: ParsedFile,
977
1002
  defined_table_registry: dict[str, str] | None = None,
1003
+ canonical_by_bare: dict[str, str] | None = None,
1004
+ ambiguous_bare: set[str] | None = None,
978
1005
  ) -> FileRowSet:
979
1006
  """Build all row dicts for one ParsedFile — Phase A (pure, no db access).
980
1007
 
@@ -986,6 +1013,13 @@ class Indexer:
986
1013
  Args:
987
1014
  parsed: ParsedFile to build rows for
988
1015
  defined_table_registry: Optional cross-file DDL dedup registry
1016
+ canonical_by_bare: Optional #44 bare-name → canonical full_id index
1017
+ (populated by CrossFileAggregator.register_pass1 from DDL tables).
1018
+ When provided, unqualified INSERT targets whose bare name maps to
1019
+ exactly one DDL canonical are rewritten to use the canonical full_id.
1020
+ When None, the rewrite is skipped (single-file / reindex_file path).
1021
+ ambiguous_bare: Optional set of bare names defined in >1 schema.
1022
+ These are never rewritten — the existing _bare_ref CLI hint handles them.
989
1023
 
990
1024
  Returns:
991
1025
  FileRowSet with all row lists and per-file counts/quality key
@@ -1126,6 +1160,24 @@ class Indexer:
1126
1160
  "table_name": edge.src.table.name,
1127
1161
  }
1128
1162
  )
1163
+ # Half A (#39): emit a SqlTable node for the source table.
1164
+ # CTE-body-only sources are not in stmt.sources (which only covers
1165
+ # tables reachable via the parser's top-level FROM list), so they were
1166
+ # previously missing from the graph. edge.src.table is a frozen
1167
+ # TableRef with schema-aliasing already applied at parse time — the
1168
+ # qualified value is guaranteed to match edge.src.table_qualified.
1169
+ # The key set is identical to other table_rows entries → upsert_nodes_bulk
1170
+ # homogeneity preserved; MERGE on primary key deduplicates re-emits.
1171
+ rows.table_rows.append(
1172
+ {
1173
+ "qualified": edge.src.table.full_id,
1174
+ "name": edge.src.table.name,
1175
+ "catalog": edge.src.table.catalog or "",
1176
+ "db": edge.src.table.db or "",
1177
+ "kind": edge.src.table.role,
1178
+ "defined_in_file": "",
1179
+ }
1180
+ )
1129
1181
  rows.column_rows.append(
1130
1182
  {
1131
1183
  "id": dst_id,
@@ -1183,15 +1235,53 @@ class Indexer:
1183
1235
  rows.counts["star_sources"] += 1
1184
1236
 
1185
1237
  # Upsert target table node (if not already a defined_table)
1186
- # so that star expansion can create destination columns
1238
+ # so that star expansion can create destination columns.
1239
+ # #44: when a canonical_by_bare index is available, attempt to resolve
1240
+ # an unqualified / wrong-schema INSERT target to its DDL-canonical full_id
1241
+ # so that INSERT-target nodes share identity with the DDL node.
1187
1242
  if stmt.target and stmt.target.full_id not in defined_table_ids:
1243
+ target_qualified = stmt.target.full_id
1244
+ target_name = stmt.target.name
1245
+ target_db = stmt.target.db or ""
1246
+ target_catalog = stmt.target.catalog or ""
1247
+ target_kind = "table"
1248
+
1249
+ # #44 canonical-name resolution: if bare name maps unambiguously to a
1250
+ # DDL-defined table, use the canonical full_id for the emitted row.
1251
+ # Degrades to a no-op when canonical_by_bare is None (single-file path).
1252
+ if canonical_by_bare is not None:
1253
+ bare = (stmt.target.name or "").lower()
1254
+ if bare and bare not in (ambiguous_bare or set()) and bare in canonical_by_bare:
1255
+ # Resolve to the sole DDL canonical — keeps kind='table'
1256
+ canonical_id = canonical_by_bare[bare]
1257
+ target_qualified = canonical_id
1258
+ # Derive name/db/catalog from the canonical full_id parts.
1259
+ # full_id format: "db.name" or "catalog.db.name" or "name"
1260
+ parts = canonical_id.split(".")
1261
+ if len(parts) >= 3:
1262
+ target_name = parts[-1]
1263
+ target_db = parts[-2]
1264
+ target_catalog = parts[-3]
1265
+ elif len(parts) == 2:
1266
+ target_name = parts[-1]
1267
+ target_db = parts[-2]
1268
+ target_catalog = ""
1269
+ else:
1270
+ target_name = canonical_id
1271
+ target_db = ""
1272
+ target_catalog = ""
1273
+ else:
1274
+ # Not resolved to a DDL canonical — mark as derived so the
1275
+ # kind filter excludes it from default (non-raw) output (#45.2)
1276
+ target_kind = "derived"
1277
+
1188
1278
  rows.table_rows.append(
1189
1279
  {
1190
- "qualified": stmt.target.full_id,
1191
- "name": stmt.target.name,
1192
- "catalog": stmt.target.catalog or "",
1193
- "db": stmt.target.db or "",
1194
- "kind": "table",
1280
+ "qualified": target_qualified,
1281
+ "name": target_name,
1282
+ "catalog": target_catalog,
1283
+ "db": target_db,
1284
+ "kind": target_kind,
1195
1285
  "defined_in_file": "",
1196
1286
  }
1197
1287
  )
@@ -1205,6 +1295,8 @@ class Indexer:
1205
1295
  defined_table_registry: dict[str, str],
1206
1296
  nonlocal_counts: dict,
1207
1297
  warning_prefix: str = "",
1298
+ canonical_by_bare: dict[str, str] | None = None,
1299
+ ambiguous_bare: set[str] | None = None,
1208
1300
  ) -> None:
1209
1301
  """Accumulate rows for all files in batch, then flush once in one transaction.
1210
1302
 
@@ -1219,13 +1311,19 @@ class Indexer:
1219
1311
  defined_table_registry: Cross-file DDL dedup registry
1220
1312
  nonlocal_counts: Mutable summary dict updated in place (tables/edges/quality/…)
1221
1313
  warning_prefix: Optional prefix for warning log messages (e.g. "resync_changed: ")
1314
+ canonical_by_bare: Optional #44 bare-name → canonical full_id index.
1315
+ When provided, unqualified INSERT targets are rewritten to their DDL
1316
+ canonical full_id. None on single-file / reindex_file paths.
1317
+ ambiguous_bare: Optional set of bare names defined in >1 schema.
1222
1318
  """
1223
1319
  if not batch:
1224
1320
  return
1225
1321
  buf = BatchRowBuffer()
1226
1322
  for parsed_in_batch in batch:
1227
1323
  try:
1228
- file_rows = self._build_file_rows(parsed_in_batch, defined_table_registry)
1324
+ file_rows = self._build_file_rows(
1325
+ parsed_in_batch, defined_table_registry, canonical_by_bare, ambiguous_bare
1326
+ )
1229
1327
  except Exception as exc:
1230
1328
  logger.warning(
1231
1329
  "%sFailed to build rows for %s: %s — skipping",