sql-code-graph 1.1.3__py3-none-any.whl → 1.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sql_code_graph-1.1.3.dist-info → sql_code_graph-1.2.2.dist-info}/METADATA +11 -1
- {sql_code_graph-1.1.3.dist-info → sql_code_graph-1.2.2.dist-info}/RECORD +15 -14
- {sql_code_graph-1.1.3.dist-info → sql_code_graph-1.2.2.dist-info}/WHEEL +1 -1
- sqlcg/__init__.py +1 -1
- sqlcg/cli/commands/analyze.py +151 -149
- sqlcg/cli/commands/db.py +92 -86
- sqlcg/cli/commands/find.py +30 -33
- sqlcg/cli/commands/gain.py +13 -11
- sqlcg/core/config.py +15 -13
- sqlcg/indexer/indexer.py +91 -11
- sqlcg/lineage/aggregator.py +17 -45
- sqlcg/parsers/ansi_parser.py +2 -2
- sqlcg/server/read_client.py +192 -0
- sqlcg/server/server.py +97 -18
- {sql_code_graph-1.1.3.dist-info → sql_code_graph-1.2.2.dist-info}/entry_points.txt +0 -0
sqlcg/cli/commands/db.py
CHANGED
|
@@ -10,6 +10,7 @@ from rich.console import Console
|
|
|
10
10
|
from sqlcg.core.config import get_backend, get_db_path
|
|
11
11
|
from sqlcg.core.freshness import compute_freshness, render_freshness_line
|
|
12
12
|
from sqlcg.core.schema import NodeLabel
|
|
13
|
+
from sqlcg.server.read_client import run_read_routed
|
|
13
14
|
from sqlcg.utils.logging import getLogger
|
|
14
15
|
|
|
15
16
|
logger = getLogger(__name__)
|
|
@@ -75,107 +76,112 @@ def db_reset( # noqa: B008
|
|
|
75
76
|
@app.command("info")
|
|
76
77
|
def db_info() -> None:
|
|
77
78
|
"""Show database stats."""
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
79
|
+
# db info is a read-only command. All Cypher reads route through the live
|
|
80
|
+
# server (run_read_routed) to avoid "Database is locked" while the MCP server
|
|
81
|
+
# holds the write lock. get_schema_version / get_indexed_sha are inlined as
|
|
82
|
+
# run_read_routed calls using their known Cypher so they too route through the
|
|
83
|
+
# socket when a server is live; this avoids a direct-open that would hit the lock.
|
|
84
|
+
|
|
85
|
+
# Schema version
|
|
86
|
+
schema_rows = run_read_routed("MATCH (v:SchemaVersion) RETURN v.version AS version LIMIT 1", {})
|
|
87
|
+
version = (schema_rows[0]["version"] if schema_rows else None) or "unknown"
|
|
88
|
+
console.print(f"Schema version: {version}")
|
|
89
|
+
|
|
90
|
+
# Freshness block — only shown when the DB has been indexed from a git repo
|
|
91
|
+
try:
|
|
92
|
+
sha_rows = run_read_routed(
|
|
93
|
+
"MATCH (v:SchemaVersion) RETURN v.indexed_sha AS sha LIMIT 1", {}
|
|
94
|
+
)
|
|
95
|
+
indexed_sha = sha_rows[0]["sha"] if sha_rows else None
|
|
96
|
+
repo_rows = run_read_routed("MATCH (r:Repo) RETURN r.path AS path LIMIT 1", {})
|
|
97
|
+
if repo_rows and indexed_sha is not None and repo_rows[0].get("path"):
|
|
98
|
+
repo_root = Path(repo_rows[0]["path"])
|
|
99
|
+
f = compute_freshness(repo_root, indexed_sha)
|
|
100
|
+
console.print(render_freshness_line(f))
|
|
101
|
+
except NotImplementedError:
|
|
102
|
+
# Neo4j backend raises NotImplementedError for get_indexed_sha — skip silently
|
|
103
|
+
pass
|
|
104
|
+
except Exception as e:
|
|
105
|
+
# Any unexpected error in the freshness block must not crash db info
|
|
106
|
+
logger.debug(f"Freshness check skipped: {e}")
|
|
107
|
+
|
|
108
|
+
# Show node counts for all labels
|
|
109
|
+
for label in NodeLabel:
|
|
83
110
|
try:
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
repo_root = Path(repo_rows[0]["path"])
|
|
88
|
-
f = compute_freshness(repo_root, indexed_sha)
|
|
89
|
-
console.print(render_freshness_line(f))
|
|
90
|
-
except NotImplementedError:
|
|
91
|
-
# Neo4j backend raises NotImplementedError for get_indexed_sha — skip silently
|
|
92
|
-
pass
|
|
111
|
+
result = run_read_routed(f"MATCH (n:{label}) RETURN COUNT(*) AS count", {})
|
|
112
|
+
count = result[0]["count"] if result else 0
|
|
113
|
+
console.print(f" {label}: {count}")
|
|
93
114
|
except Exception as e:
|
|
94
|
-
#
|
|
95
|
-
logger.
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
"[red]Database is empty. Run 'sqlcg db init' and 'sqlcg index <path>' first.[/red]"
|
|
115
|
+
# Log unexpected exceptions instead of silently skipping
|
|
116
|
+
logger.error(f"Error getting count for {label}: {e}")
|
|
117
|
+
console.print(f" [red]{label}: error[/red]")
|
|
118
|
+
|
|
119
|
+
# Health check section
|
|
120
|
+
repo_count_result = run_read_routed("MATCH (n:Repo) RETURN COUNT(n) AS count", {})
|
|
121
|
+
repo_count = repo_count_result[0]["count"] if repo_count_result else 0
|
|
122
|
+
|
|
123
|
+
if repo_count == 0:
|
|
124
|
+
console.print( # noqa: E501
|
|
125
|
+
"[red]Database is empty. Run 'sqlcg db init' and 'sqlcg index <path>' first.[/red]"
|
|
126
|
+
)
|
|
127
|
+
else:
|
|
128
|
+
query_count_result = run_read_routed("MATCH (n:SqlQuery) RETURN COUNT(n) AS count", {})
|
|
129
|
+
query_count = query_count_result[0]["count"] if query_count_result else 0
|
|
130
|
+
|
|
131
|
+
if query_count == 0:
|
|
132
|
+
console.print(
|
|
133
|
+
"[yellow]No queries indexed. Run 'sqlcg index <path>' to populate "
|
|
134
|
+
"the graph.[/yellow]"
|
|
115
135
|
)
|
|
116
136
|
else:
|
|
117
|
-
|
|
118
|
-
|
|
137
|
+
col_count_result = run_read_routed("MATCH (n:SqlColumn) RETURN COUNT(n) AS count", {})
|
|
138
|
+
col_count = col_count_result[0]["count"] if col_count_result else 0
|
|
119
139
|
|
|
120
|
-
if
|
|
140
|
+
if col_count == 0:
|
|
121
141
|
console.print(
|
|
122
|
-
"[yellow]
|
|
123
|
-
"
|
|
142
|
+
"[yellow]Column lineage not available. Tools trace_column_lineage, "
|
|
143
|
+
"get_downstream_dependencies, and get_upstream_dependencies "
|
|
144
|
+
"will return empty results.[/yellow]"
|
|
124
145
|
)
|
|
125
|
-
else:
|
|
126
|
-
col_count_result = backend.run_read(
|
|
127
|
-
"MATCH (n:SqlColumn) RETURN COUNT(n) AS count", {}
|
|
128
|
-
)
|
|
129
|
-
col_count = col_count_result[0]["count"] if col_count_result else 0
|
|
130
|
-
|
|
131
|
-
if col_count == 0:
|
|
132
|
-
console.print(
|
|
133
|
-
"[yellow]Column lineage not available. Tools trace_column_lineage, "
|
|
134
|
-
"get_downstream_dependencies, and get_upstream_dependencies "
|
|
135
|
-
"will return empty results.[/yellow]"
|
|
136
|
-
)
|
|
137
|
-
|
|
138
|
-
# Print COLUMN_LINEAGE edges count
|
|
139
|
-
edges_result = backend.run_read(
|
|
140
|
-
"MATCH ()-[r:COLUMN_LINEAGE]->() RETURN COUNT(r) AS count", {}
|
|
141
|
-
)
|
|
142
|
-
edges_count = edges_result[0]["count"] if edges_result else 0
|
|
143
|
-
console.print(f" COLUMN_LINEAGE edges: {edges_count}")
|
|
144
146
|
|
|
145
|
-
|
|
146
|
-
|
|
147
|
+
# Print COLUMN_LINEAGE edges count
|
|
148
|
+
edges_result = run_read_routed("MATCH ()-[r:COLUMN_LINEAGE]->() RETURN COUNT(r) AS count", {})
|
|
149
|
+
edges_count = edges_result[0]["count"] if edges_result else 0
|
|
150
|
+
console.print(f" COLUMN_LINEAGE edges: {edges_count}")
|
|
147
151
|
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
console.print(f" STAR_SOURCE edges: {star_source_count}")
|
|
152
|
+
# Print star resolution metrics (T-07)
|
|
153
|
+
from sqlcg.core.queries import COUNT_STAR_EXPANSIONS_QUERY, COUNT_STAR_SOURCES_QUERY
|
|
151
154
|
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
+
star_source_result = run_read_routed(COUNT_STAR_SOURCES_QUERY, {})
|
|
156
|
+
star_source_count = star_source_result[0]["n"] if star_source_result else 0
|
|
157
|
+
console.print(f" STAR_SOURCE edges: {star_source_count}")
|
|
155
158
|
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
159
|
+
star_expansion_result = run_read_routed(COUNT_STAR_EXPANSIONS_QUERY, {})
|
|
160
|
+
star_expansion_count = star_expansion_result[0]["n"] if star_expansion_result else 0
|
|
161
|
+
console.print(f" STAR_EXPANSION lineage edges: {star_expansion_count}")
|
|
162
|
+
|
|
163
|
+
# Print parsing mode distribution
|
|
164
|
+
mode_query = (
|
|
165
|
+
"MATCH (q:SqlQuery) RETURN q.parsing_mode AS mode, COUNT(q) AS cnt ORDER BY cnt DESC"
|
|
166
|
+
)
|
|
167
|
+
mode_rows = run_read_routed(mode_query, {})
|
|
168
|
+
if mode_rows and "mode" in mode_rows[0]:
|
|
169
|
+
console.print("\n Parsing mode distribution:")
|
|
170
|
+
for row in mode_rows:
|
|
171
|
+
console.print(f" {row['mode']}: {row['cnt']}")
|
|
165
172
|
|
|
166
173
|
|
|
167
174
|
@app.command("list-repos")
|
|
168
175
|
def list_repos() -> None:
|
|
169
176
|
"""List all indexed repositories."""
|
|
170
|
-
|
|
171
|
-
result = backend.run_read("MATCH (r:Repo) RETURN r.path AS path, r.name AS name", {})
|
|
177
|
+
result = run_read_routed("MATCH (r:Repo) RETURN r.path AS path, r.name AS name", {})
|
|
172
178
|
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
179
|
+
if not result:
|
|
180
|
+
console.print("[yellow]No repositories indexed[/yellow]")
|
|
181
|
+
else:
|
|
182
|
+
from rich.table import Table
|
|
177
183
|
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
184
|
+
table = Table("Path", "Name")
|
|
185
|
+
for row in result:
|
|
186
|
+
table.add_row(str(row.get("path", "")), str(row.get("name", "")))
|
|
187
|
+
console.print(table)
|
sqlcg/cli/commands/find.py
CHANGED
|
@@ -4,8 +4,8 @@ import typer
|
|
|
4
4
|
from rich.console import Console
|
|
5
5
|
from rich.table import Table
|
|
6
6
|
|
|
7
|
-
from sqlcg.core.config import get_backend
|
|
8
7
|
from sqlcg.core.schema import NodeLabel
|
|
8
|
+
from sqlcg.server.read_client import run_read_routed
|
|
9
9
|
|
|
10
10
|
app = typer.Typer(help="Search the graph")
|
|
11
11
|
console = Console()
|
|
@@ -18,21 +18,20 @@ def find_table( # noqa: B008
|
|
|
18
18
|
) -> None:
|
|
19
19
|
"""Find a table by name."""
|
|
20
20
|
name = name.lower() # graph keys are lowercased at index time (C2 normalization)
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
from sqlcg.server.noise_filter import NoiseFilter
|
|
21
|
+
results = run_read_routed(
|
|
22
|
+
f"MATCH (t:{NodeLabel.TABLE}) WHERE t.qualified CONTAINS $name "
|
|
23
|
+
"RETURN t.qualified AS qualified, t.kind AS kind LIMIT 50",
|
|
24
|
+
{"name": name},
|
|
25
|
+
)
|
|
26
|
+
if not raw:
|
|
27
|
+
from sqlcg.server.noise_filter import NoiseFilter
|
|
29
28
|
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
29
|
+
nf = NoiseFilter.from_config() # repo_root=None → falls back to Path.cwd()
|
|
30
|
+
ids = [r["qualified"] for r in results]
|
|
31
|
+
kept, _ = nf.filter_nodes(ids)
|
|
32
|
+
kept_set = set(kept)
|
|
33
|
+
results = [r for r in results if r["qualified"] in kept_set]
|
|
34
|
+
_print_table(results, ["qualified", "kind"])
|
|
36
35
|
|
|
37
36
|
|
|
38
37
|
@app.command("column")
|
|
@@ -42,18 +41,17 @@ def find_column( # noqa: B008
|
|
|
42
41
|
) -> None:
|
|
43
42
|
"""Find a column by table.column reference."""
|
|
44
43
|
ref = ref.lower() # graph keys are lowercased at index time (C2 normalization)
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
from sqlcg.server.noise_filter import NoiseFilter
|
|
44
|
+
results = run_read_routed(
|
|
45
|
+
f"MATCH (c:{NodeLabel.COLUMN}) WHERE c.id CONTAINS $ref RETURN c.id AS id LIMIT 50",
|
|
46
|
+
{"ref": ref},
|
|
47
|
+
)
|
|
48
|
+
if not raw:
|
|
49
|
+
from sqlcg.server.noise_filter import NoiseFilter
|
|
52
50
|
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
51
|
+
nf = NoiseFilter.from_config() # repo_root=None → falls back to Path.cwd()
|
|
52
|
+
# Filter on the schema.table portion of each column id (schema.table.column)
|
|
53
|
+
results = [r for r in results if not nf.is_noise(r["id"].rsplit(".", 1)[0])]
|
|
54
|
+
_print_table(results, ["id"])
|
|
57
55
|
|
|
58
56
|
|
|
59
57
|
@app.command("pattern")
|
|
@@ -61,13 +59,12 @@ def find_pattern( # noqa: B008
|
|
|
61
59
|
pattern: str = typer.Argument(..., help="SQL pattern to search for"), # noqa: B008
|
|
62
60
|
) -> None:
|
|
63
61
|
"""Find queries containing a SQL pattern."""
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
_print_table(results, ["id", "kind"])
|
|
62
|
+
results = run_read_routed(
|
|
63
|
+
f"MATCH (q:{NodeLabel.QUERY}) WHERE q.sql CONTAINS $pattern "
|
|
64
|
+
"RETURN q.id AS id, q.kind AS kind LIMIT 50",
|
|
65
|
+
{"pattern": pattern},
|
|
66
|
+
)
|
|
67
|
+
_print_table(results, ["id", "kind"])
|
|
71
68
|
|
|
72
69
|
|
|
73
70
|
def _print_table(rows: list[dict], columns: list[str]) -> None:
|
sqlcg/cli/commands/gain.py
CHANGED
|
@@ -7,8 +7,8 @@ from pathlib import Path
|
|
|
7
7
|
import typer
|
|
8
8
|
from rich.console import Console
|
|
9
9
|
|
|
10
|
-
from sqlcg.core.config import get_backend
|
|
11
10
|
from sqlcg.metrics import store as metrics_module
|
|
11
|
+
from sqlcg.server.read_client import run_read_routed
|
|
12
12
|
from sqlcg.utils.logging import getLogger
|
|
13
13
|
|
|
14
14
|
logger = getLogger(__name__)
|
|
@@ -120,19 +120,21 @@ def gain_cmd(
|
|
|
120
120
|
)
|
|
121
121
|
execute_cypher_ratio = execute_cypher_count / total_calls if total_calls > 0 else 0
|
|
122
122
|
|
|
123
|
-
# Section F: parse quality from graph
|
|
123
|
+
# Section F: parse quality from graph.
|
|
124
|
+
# run_read_routed raises typer.Exit (Exception-derived, NOT SystemExit) on
|
|
125
|
+
# server-busy timeout, so the except-Exception block degrades gracefully
|
|
126
|
+
# (skips the parse-quality section) instead of crashing gain (WARNING 3).
|
|
124
127
|
parse_quality: dict[str, int] | None = None
|
|
125
128
|
try:
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
parse_quality = {str(r["mode"]): int(r["cnt"]) for r in mode_rows}
|
|
129
|
+
mode_rows = run_read_routed(
|
|
130
|
+
"MATCH (q:SqlQuery) RETURN q.parsing_mode AS mode,"
|
|
131
|
+
" COUNT(q) AS cnt ORDER BY cnt DESC",
|
|
132
|
+
{},
|
|
133
|
+
)
|
|
134
|
+
if mode_rows and "mode" in mode_rows[0]:
|
|
135
|
+
parse_quality = {str(r["mode"]): int(r["cnt"]) for r in mode_rows}
|
|
134
136
|
except Exception:
|
|
135
|
-
pass # graph not available — skip quality section
|
|
137
|
+
pass # graph not available or server busy — skip quality section
|
|
136
138
|
|
|
137
139
|
if json_output:
|
|
138
140
|
payload: dict = {
|
sqlcg/core/config.py
CHANGED
|
@@ -350,18 +350,13 @@ def get_backend(read_only: bool = False) -> "GraphBackend":
|
|
|
350
350
|
"""Get a graph backend instance respecting the SQLCG_BACKEND env var.
|
|
351
351
|
|
|
352
352
|
Args:
|
|
353
|
-
read_only: Open in read-only mode.
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
process-level; a ``read_only=True`` open still fails with
|
|
361
|
-
"Database is locked" when a writer is active. Reads during an
|
|
362
|
-
active writer remain a known limitation (future work: route reads
|
|
363
|
-
through the live MCP server).
|
|
364
|
-
Neo4j has no single-writer lock; this flag is a no-op there.
|
|
353
|
+
read_only: Open the database in read-only mode. For KuzuBackend this
|
|
354
|
+
enables multiple concurrent read-only opens (reader/reader
|
|
355
|
+
concurrency), but does NOT allow reads while a read-write writer
|
|
356
|
+
holds the exclusive process lock — that requires routing through the
|
|
357
|
+
live MCP server via ``read_client.run_read_routed`` (v1.2.0).
|
|
358
|
+
Ignored for Neo4jBackend (Neo4j has no single-writer process lock;
|
|
359
|
+
the flag is a no-op and the normal connection is opened).
|
|
365
360
|
All writer call sites (index, reindex, db init/reset, server
|
|
366
361
|
init_backend) use the default ``False``.
|
|
367
362
|
|
|
@@ -370,6 +365,13 @@ def get_backend(read_only: bool = False) -> "GraphBackend":
|
|
|
370
365
|
|
|
371
366
|
Raises:
|
|
372
367
|
ValueError: If backend type is not recognized
|
|
368
|
+
|
|
369
|
+
Note:
|
|
370
|
+
CLI read commands (find, analyze, db info, gain) route through a live
|
|
371
|
+
MCP server via ``read_client.run_read_routed`` (v1.2.0) when a server
|
|
372
|
+
is live, falling back to ``get_backend(read_only=True)`` when no server
|
|
373
|
+
is present. The fallback path still contends for the process lock under
|
|
374
|
+
an active writer (Windows / no-server fallback only).
|
|
373
375
|
"""
|
|
374
376
|
backend_type = os.getenv("SQLCG_BACKEND", "kuzu")
|
|
375
377
|
|
|
@@ -396,7 +398,7 @@ def get_backend(read_only: bool = False) -> "GraphBackend":
|
|
|
396
398
|
from sqlcg.core.neo4j_backend import Neo4jBackend
|
|
397
399
|
|
|
398
400
|
neo4j_cfg = Neo4jConfig.from_env()
|
|
399
|
-
# Neo4j
|
|
401
|
+
# read_only is ignored for Neo4j — no single-writer process lock.
|
|
400
402
|
return Neo4jBackend(neo4j_cfg.uri, neo4j_cfg.user, neo4j_cfg.password)
|
|
401
403
|
else:
|
|
402
404
|
raise ValueError(f"Unknown backend type: {backend_type}")
|
sqlcg/indexer/indexer.py
CHANGED
|
@@ -93,8 +93,11 @@ def _flush_row_batch(db: GraphBackend, buf: BatchRowBuffer) -> None:
|
|
|
93
93
|
This is the v1.1.1 batch-flush core: called once per batch (not once per file).
|
|
94
94
|
Dedup keys mirror the graph's MERGE cardinality:
|
|
95
95
|
- file_rows: path (primary key)
|
|
96
|
-
- table_rows: qualified (primary key); prefers row with non-empty
|
|
97
|
-
defined_in_file so DEFINED_IN provenance is preserved
|
|
96
|
+
- table_rows: qualified (primary key); prefers (1) row with non-empty
|
|
97
|
+
defined_in_file so DEFINED_IN provenance is preserved;
|
|
98
|
+
(2) structural kind ('cte','derived','external') over
|
|
99
|
+
default 'table' so CTE aliases keep kind='cte' even when
|
|
100
|
+
also seen as source references with the default kind.
|
|
98
101
|
- column_rows: id (primary key)
|
|
99
102
|
- query_rows: id (primary key, globally unique path:index)
|
|
100
103
|
- edge rows: (src_key, dst_key) only — matches MERGE (src)-[r]->(dst)
|
|
@@ -108,12 +111,27 @@ def _flush_row_batch(db: GraphBackend, buf: BatchRowBuffer) -> None:
|
|
|
108
111
|
# --- Phase B: batch-scoped dedup ---
|
|
109
112
|
# For table_rows, prefer defined rows (non-empty defined_in_file) so provenance
|
|
110
113
|
# is not lost when a shared table is referenced by multiple files.
|
|
114
|
+
# Also prefer structurally-assigned kinds ('cte', 'derived', 'external') over the
|
|
115
|
+
# default 'table' kind: a CTE alias emitted first as a source reference (kind='table'
|
|
116
|
+
# default) and later confirmed as a CTE destination (kind='cte') must keep 'cte' so
|
|
117
|
+
# the kind filter correctly excludes it from default filtered output.
|
|
118
|
+
_structural_kinds = {"cte", "derived", "external"}
|
|
111
119
|
table_dedup: dict[str, dict] = {}
|
|
112
120
|
for r in buf.table_rows:
|
|
113
121
|
key = r["qualified"]
|
|
114
122
|
existing = table_dedup.get(key)
|
|
115
|
-
if existing is None
|
|
123
|
+
if existing is None:
|
|
116
124
|
table_dedup[key] = r
|
|
125
|
+
else:
|
|
126
|
+
# Rule 1: prefer rows with defined_in_file (DDL provenance)
|
|
127
|
+
if not existing.get("defined_in_file") and r.get("defined_in_file"):
|
|
128
|
+
table_dedup[key] = r
|
|
129
|
+
# Rule 2: prefer structural kind over default 'table'
|
|
130
|
+
elif (
|
|
131
|
+
existing.get("kind", "table") not in _structural_kinds
|
|
132
|
+
and r.get("kind", "table") in _structural_kinds
|
|
133
|
+
):
|
|
134
|
+
table_dedup[key] = r
|
|
117
135
|
table_rows = list(table_dedup.values())
|
|
118
136
|
|
|
119
137
|
column_rows = list({r["id"]: r for r in buf.column_rows}.values())
|
|
@@ -432,7 +450,14 @@ class Indexer:
|
|
|
432
450
|
Note: sqlcg watch's reindex_file uses a separate code path with
|
|
433
451
|
its own short per-file transaction. PERF-BATCH only affects index_repo.
|
|
434
452
|
"""
|
|
435
|
-
self._upsert_file_batch(
|
|
453
|
+
self._upsert_file_batch(
|
|
454
|
+
batch,
|
|
455
|
+
db,
|
|
456
|
+
defined_table_registry,
|
|
457
|
+
nonlocal_counts,
|
|
458
|
+
canonical_by_bare=aggregator.canonical_by_bare,
|
|
459
|
+
ambiguous_bare=aggregator._ambiguous_bare,
|
|
460
|
+
)
|
|
436
461
|
|
|
437
462
|
if profile:
|
|
438
463
|
_t_upsert_start = time.perf_counter()
|
|
@@ -975,6 +1000,8 @@ class Indexer:
|
|
|
975
1000
|
self,
|
|
976
1001
|
parsed: ParsedFile,
|
|
977
1002
|
defined_table_registry: dict[str, str] | None = None,
|
|
1003
|
+
canonical_by_bare: dict[str, str] | None = None,
|
|
1004
|
+
ambiguous_bare: set[str] | None = None,
|
|
978
1005
|
) -> FileRowSet:
|
|
979
1006
|
"""Build all row dicts for one ParsedFile — Phase A (pure, no db access).
|
|
980
1007
|
|
|
@@ -986,6 +1013,13 @@ class Indexer:
|
|
|
986
1013
|
Args:
|
|
987
1014
|
parsed: ParsedFile to build rows for
|
|
988
1015
|
defined_table_registry: Optional cross-file DDL dedup registry
|
|
1016
|
+
canonical_by_bare: Optional #44 bare-name → canonical full_id index
|
|
1017
|
+
(populated by CrossFileAggregator.register_pass1 from DDL tables).
|
|
1018
|
+
When provided, unqualified INSERT targets whose bare name maps to
|
|
1019
|
+
exactly one DDL canonical are rewritten to use the canonical full_id.
|
|
1020
|
+
When None, the rewrite is skipped (single-file / reindex_file path).
|
|
1021
|
+
ambiguous_bare: Optional set of bare names defined in >1 schema.
|
|
1022
|
+
These are never rewritten — the existing _bare_ref CLI hint handles them.
|
|
989
1023
|
|
|
990
1024
|
Returns:
|
|
991
1025
|
FileRowSet with all row lists and per-file counts/quality key
|
|
@@ -1201,15 +1235,53 @@ class Indexer:
|
|
|
1201
1235
|
rows.counts["star_sources"] += 1
|
|
1202
1236
|
|
|
1203
1237
|
# Upsert target table node (if not already a defined_table)
|
|
1204
|
-
# so that star expansion can create destination columns
|
|
1238
|
+
# so that star expansion can create destination columns.
|
|
1239
|
+
# #44: when a canonical_by_bare index is available, attempt to resolve
|
|
1240
|
+
# an unqualified / wrong-schema INSERT target to its DDL-canonical full_id
|
|
1241
|
+
# so that INSERT-target nodes share identity with the DDL node.
|
|
1205
1242
|
if stmt.target and stmt.target.full_id not in defined_table_ids:
|
|
1243
|
+
target_qualified = stmt.target.full_id
|
|
1244
|
+
target_name = stmt.target.name
|
|
1245
|
+
target_db = stmt.target.db or ""
|
|
1246
|
+
target_catalog = stmt.target.catalog or ""
|
|
1247
|
+
target_kind = "table"
|
|
1248
|
+
|
|
1249
|
+
# #44 canonical-name resolution: if bare name maps unambiguously to a
|
|
1250
|
+
# DDL-defined table, use the canonical full_id for the emitted row.
|
|
1251
|
+
# Degrading to no-op when canonical_by_bare is None (single-file path).
|
|
1252
|
+
if canonical_by_bare is not None:
|
|
1253
|
+
bare = (stmt.target.name or "").lower()
|
|
1254
|
+
if bare and bare not in (ambiguous_bare or set()) and bare in canonical_by_bare:
|
|
1255
|
+
# Resolve to the sole DDL canonical — keeps kind='table'
|
|
1256
|
+
canonical_id = canonical_by_bare[bare]
|
|
1257
|
+
target_qualified = canonical_id
|
|
1258
|
+
# Derive name/db/catalog from the canonical full_id parts.
|
|
1259
|
+
# full_id format: "db.name" or "catalog.db.name" or "name"
|
|
1260
|
+
parts = canonical_id.split(".")
|
|
1261
|
+
if len(parts) >= 3:
|
|
1262
|
+
target_name = parts[-1]
|
|
1263
|
+
target_db = parts[-2]
|
|
1264
|
+
target_catalog = parts[-3]
|
|
1265
|
+
elif len(parts) == 2:
|
|
1266
|
+
target_name = parts[-1]
|
|
1267
|
+
target_db = parts[-2]
|
|
1268
|
+
target_catalog = ""
|
|
1269
|
+
else:
|
|
1270
|
+
target_name = canonical_id
|
|
1271
|
+
target_db = ""
|
|
1272
|
+
target_catalog = ""
|
|
1273
|
+
else:
|
|
1274
|
+
# Not resolved to a DDL canonical — mark as derived so the
|
|
1275
|
+
# kind filter excludes it from default (non-raw) output (#45.2)
|
|
1276
|
+
target_kind = "derived"
|
|
1277
|
+
|
|
1206
1278
|
rows.table_rows.append(
|
|
1207
1279
|
{
|
|
1208
|
-
"qualified":
|
|
1209
|
-
"name":
|
|
1210
|
-
"catalog":
|
|
1211
|
-
"db":
|
|
1212
|
-
"kind":
|
|
1280
|
+
"qualified": target_qualified,
|
|
1281
|
+
"name": target_name,
|
|
1282
|
+
"catalog": target_catalog,
|
|
1283
|
+
"db": target_db,
|
|
1284
|
+
"kind": target_kind,
|
|
1213
1285
|
"defined_in_file": "",
|
|
1214
1286
|
}
|
|
1215
1287
|
)
|
|
@@ -1223,6 +1295,8 @@ class Indexer:
|
|
|
1223
1295
|
defined_table_registry: dict[str, str],
|
|
1224
1296
|
nonlocal_counts: dict,
|
|
1225
1297
|
warning_prefix: str = "",
|
|
1298
|
+
canonical_by_bare: dict[str, str] | None = None,
|
|
1299
|
+
ambiguous_bare: set[str] | None = None,
|
|
1226
1300
|
) -> None:
|
|
1227
1301
|
"""Accumulate rows for all files in batch, then flush once in one transaction.
|
|
1228
1302
|
|
|
@@ -1237,13 +1311,19 @@ class Indexer:
|
|
|
1237
1311
|
defined_table_registry: Cross-file DDL dedup registry
|
|
1238
1312
|
nonlocal_counts: Mutable summary dict updated in place (tables/edges/quality/…)
|
|
1239
1313
|
warning_prefix: Optional prefix for warning log messages (e.g. "resync_changed: ")
|
|
1314
|
+
canonical_by_bare: Optional #44 bare-name → canonical full_id index.
|
|
1315
|
+
When provided, unqualified INSERT targets are rewritten to their DDL
|
|
1316
|
+
canonical full_id. None on single-file / reindex_file paths.
|
|
1317
|
+
ambiguous_bare: Optional set of bare names defined in >1 schema.
|
|
1240
1318
|
"""
|
|
1241
1319
|
if not batch:
|
|
1242
1320
|
return
|
|
1243
1321
|
buf = BatchRowBuffer()
|
|
1244
1322
|
for parsed_in_batch in batch:
|
|
1245
1323
|
try:
|
|
1246
|
-
file_rows = self._build_file_rows(
|
|
1324
|
+
file_rows = self._build_file_rows(
|
|
1325
|
+
parsed_in_batch, defined_table_registry, canonical_by_bare, ambiguous_bare
|
|
1326
|
+
)
|
|
1247
1327
|
except Exception as exc:
|
|
1248
1328
|
logger.warning(
|
|
1249
1329
|
"%sFailed to build rows for %s: %s — skipping",
|